APPENDIX
This appendix includes all of the execution steps used in analyzing the data, from preprocessing with the raw public data (A0) to statistical analysis (A1 to A23).
In order to reproduce these steps, it is necessary to place all of the following files in the same directory as 000_analysis_script.qmd. Moreover, a renv.lock file and a Docker image were provided to ensure that the analysis can be executed within the same software environment used by the authors.
- 999_public_data.rds
- 999_irish_sponsored_public.csv
- 111_ethnicity_labels_translated.csv
- 111_ethnicity_open_answers_recoded.csv
- 111_education_recoded.csv
- 111_sex_open_answers_recoded.csv
- 111_country_variables.csv
- 111_ip_repeated.csv
- 111_administrative_location.csv
- 111_response_ids_botnets.csv
- 111_income_recoded.rds
- 111_generic_version_country.csv
- 222_codebook.xlsx
- 777_countries_documentation (folder)
- renv.lock (optional)
- qmd-env-arm64.tar (optional)
To conduct the analyses (A1 to A23) without running the preprocessing steps, place 999_cleaned_data.rds in the same directory, run the Setup section, and start at any Analyses section. Each analysis section is independent and can be run separately.
Setup
Set working directory
Load packages
# Bootstrap pacman, then install (if missing) and attach every package
# used by the preprocessing and analysis sections below.
# NOTE(review): jtools is called later (jtools::svycor in weighted_corr)
# but is not listed here -- it must be installed separately; confirm.
if (!require("pacman")) install.packages("pacman")
pacman::p_load(char = c(
# data import and cleaning
"readr",
"readxl",
"qualtRics",
"dplyr",
"tidyr",
"stringr",
"purrr",
"tibble",
"forcats",
"janitor",
"labelled",
"lubridate",
# visualization
"ggplot2",
"MetBrewer",
"scales",
"ggridges",
"ggtext",
"ggfx",
"ggh4x",
"ggeffects",
"cowplot",
"grid",
"gridExtra",
"gtable",
"ggplotify",
"see",
"corrplot",
"visdat",
"ggflags",
"showtext",
"sysfonts",
# modeling and statistics
"mgcv",
"lme4",
"survey",
"metafor",
"binom",
"lsr",
"car",
"emmeans",
"interactions",
"rmcorr",
"psych",
"Hmisc",
"weights",
# SEM and reliability
"lavaan",
"semTools",
# reporting and tables
"broom.mixed",
"kableExtra",
"sjPlot",
"flextable",
"officer",
"report",
"performance",
"reactable",
# spatial data and maps
"sf",
"rnaturalearth",
"rnaturalearthdata",
"leaflet",
"leaflet.extras",
"leaflet.extras2",
# utilities
"countrycode",
"htmltools",
"rlang",
"sessioninfo"
))Define global settings
# Global session options: disable scientific notation, widen console
# output, and silence dplyr's summarise() grouping messages.
options(
# Remove scientific notation
scipen = 999,
width = 150,
# Clean up dplyr messages
dplyr.summarise.inform = FALSE)
# Set up theme for plots
# Register the "Inter" font from Google Fonts and enable showtext
# rendering (font_add_google needs network access the first time).
sysfonts::font_add_google("Inter")
showtext::showtext_auto()
# Custom ggplot2 theme for all figures: dark-blue (#051520) text,
# bold axis titles/labels, no grid lines, a thin x axis line, and
# transparent plot/panel/legend backgrounds. Set as the default theme.
theme_gmh <-
ggplot2::theme_minimal(base_family = "Inter", base_size = 12) +
ggplot2::theme(
text = element_text(family = "Inter", colour = "#051520"),
axis.text.y = element_text(color = "#051520"),
axis.text.x = element_text(
color = "#051520",
margin = margin(t = 1),
face = "bold"
),
axis.title.x = element_text(color = "#051520", face = "bold"),
axis.title.y = element_text(color = "#051520", face = "bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line.x = element_line(colour = "#051520", linewidth = 0.4),
plot.margin = margin(6, 6, 6, 6),
plot.subtitle = ggplot2::element_text(color = "#051520"),
plot.background = ggplot2::element_rect(fill = "transparent", color = NA),
panel.background = ggplot2::element_rect(fill = "transparent", color = NA),
legend.background = ggplot2::element_rect(fill = "transparent", color = NA)
)
ggplot2::theme_set(theme_gmh)
# Print variables in a tidy way
#
# Prints a compact summary of a (possibly labelled) column:
# a "$name" header, the column's "label" attribute, a frequency
# table that includes NA counts, and the column's class.
#
# `col` should be passed as `dataframe$column` so the column name
# can be recovered from the unevaluated call; a bare vector name
# also works (the regex then leaves the name unchanged).
table_label <- function(col) {
  # extract what is after $ in dataframe$column
  name <- sub(".*\\$(.+)", "\\1", deparse(substitute(col)))
  # extract the label of the given column; fall back to "" because
  # sprintf() with a NULL argument returns character(0), which made
  # cat() silently drop the entire header for unlabelled columns
  lab <- attr(col, "label")
  if (is.null(lab)) lab <- ""
  # print header with column name and label
  cat(sprintf("$%s\n%s\n", name, lab))
  # print table output with NA counts
  tbl <- table(col, useNA = "always")
  names(dimnames(tbl)) <- NULL
  print(tbl)
  # print the class of the column
  cat("Class:", paste(class(col), collapse = ", "), "\n")
}
# Print a pretty table
#
# Renders `data` as a searchable, striped, compact reactable widget
# with a fixed height of 650px, sorted ascending on `sorted_col`,
# and a caller-supplied `width`.
print_reactable <- function(data, sorted_col, width) {
  # Shared defaults applied to every column
  col_defaults <- reactable::colDef(
    vAlign = "center",
    headerVAlign = "bottom",
    class = "cell",
    headerClass = "header"
  )
  reactable::reactable(
    data,
    height = 650,
    width = width,
    pagination = FALSE,
    searchable = TRUE,
    striped = TRUE,
    compact = TRUE,
    highlight = TRUE,
    defaultSorted = sorted_col,
    defaultSortOrder = "asc",
    defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
    defaultColDef = col_defaults
  )
}
# Print pretty summaries
#
# Summarises one model term of a survey-weighted regression:
# - a design-based F test for `term` via survey::regTermTest();
# - a design-based R^2 computed from the weighted variance of the
#   outcome `var` and of the response-scale residuals;
# - Cohen's f = sqrt(R^2 / (1 - R^2)) and percent variance explained.
# Returns a one-row tibble of formatted strings (plus df columns).
#
# NOTE(review): the output column "Ward_F" appears to be a typo for
# the Wald F statistic reported by regTermTest -- confirm before
# renaming, since downstream code may reference this column name.
print_summ <- function(model, design, var, term) {
format_p <- function(p) {
if (p < 0.001) {
return("< .001")
}
base::format(base::round(p, 3), nsmall = 3)
}
term_test <- survey::regTermTest(model, term)
# Attach response-scale residuals to the design so their weighted
# variance can be computed with svyvar()
svy_resid <-
update(design, .resid = stats::residuals(model, type = "response"))
var_y <-
survey::svyvar(stats::as.formula(paste0("~", var)), design = svy_resid)[1]
var_e <- survey::svyvar(~.resid, design = svy_resid)[1]
r2 <- 1 - (var_e / var_y)
cohens_f <- base::sqrt(r2 / (1 - r2))
percent_var_explained <- r2 * 100
tibble::tibble(
Ward_F =
base::format(base::round(base::as.numeric(term_test$Ftest[1]), 2), nsmall = 2),
df1 = term_test$df,
df2 = term_test$ddf,
p = format_p(term_test$p),
r2 = base::format(base::round(r2, 4), nsmall = 4),
cohens_f = base::format(base::round(cohens_f, 4), nsmall = 4),
percent_var_explained =
base::format(base::round(percent_var_explained, 4), nsmall = 4)
)
}
# Calculate weighted correlation
#
# Computes survey-weighted correlations with jtools::svycor()
# (bootstrapped significance, bootn = 1000), using the `ps_weight`
# column of `dat` as weights in a clustering-free design (ids = ~1).
#
# - multiple = FALSE: `var_x` and `var_y` are two bare column names;
#   returns a one-row data.frame of formatted r, t, and p strings.
# - multiple = TRUE: `var_x` is the outcome column (bare name) and
#   `var_y` evaluates to a character vector of item names; returns
#   one row per item with an extra `item` column.
#
# NOTE(review): jtools is not in the pacman::p_load() list of this
# script -- it must be installed for these calls to succeed.
weighted_corr <- function(dat, var_x, var_y, multiple = FALSE) {
  # Format one svycor estimate; cell [2] of the 2x2 matrices holds
  # the cross-correlation between the two variables.
  format_est <- function(est) {
    p_val <- est$p.values[2]
    data.frame(
      r = as.character(format(round(est$cors[2], 3), nsmall = 3)),
      t = as.character(format(round(est$t.values[2], 2), nsmall = 2)),
      # Threshold now matches the "<.001" label; the original
      # compared against 0.01, mislabelling p in [0.001, 0.01).
      p = dplyr::if_else(
        p_val < 0.001, "<.001",
        as.character(format(round(p_val, 3), nsmall = 3)))
    )
  }
  if (!isTRUE(multiple)) {
    var_x <- rlang::ensym(var_x)
    var_y <- rlang::ensym(var_y)
    design <- survey::svydesign(
      ids = ~ 1,
      weights = ~ ps_weight,
      data = dat
    )
    est <- jtools::svycor(
      stats::as.formula(
        paste0("~", rlang::as_name(var_x), " + ", rlang::as_name(var_y))),
      design,
      sig.stats = TRUE,
      bootn = 1000,
      mean1 = TRUE
    )
    format_est(est)
  } else {
    outcome_sym <- rlang::ensym(var_x)
    items_val <- rlang::eval_tidy(rlang::enquo(var_y))
    design <- survey::svydesign(ids = ~1, weights = ~ps_weight, data = dat)
    purrr::map_dfr(items_val, function(item_name) {
      f <- stats::as.formula(paste0("~", rlang::as_name(outcome_sym), " + ", item_name))
      est <- jtools::svycor(f, design, sig.stats = TRUE, bootn = 1000, mean1 = TRUE)
      cbind(data.frame(item = item_name), format_est(est))
    })
  }
}
# Define MPWB items and labels
# Column names of the ten MPWB well-being items, all sharing the
# "mpwb_" prefix.
mpwb_items <- paste0(
  "mpwb_",
  c(
    "positive_relationships",
    "meaning",
    "competence",
    "engagement",
    "self_esteem",
    "optimism",
    "positive_emotion",
    "emotional_stability",
    "resilience",
    "vitality"
  )
)
# Display labels for each MPWB item, keyed by its column name.
mpwb_labels <- c(
  "Positive relationships", "Meaning", "Competence", "Engagement",
  "Self-esteem", "Optimism", "Positive emotion", "Emotional stability",
  "Resilience", "Vitality"
)
names(mpwb_labels) <- paste0(
  "mpwb_",
  c(
    "positive_relationships", "meaning", "competence", "engagement",
    "self_esteem", "optimism", "positive_emotion",
    "emotional_stability", "resilience", "vitality"
  )
)
# The four PHQ-4 screener items (phq_* and gad_* columns).
phq4_items <- c("phq_interest", "phq_down", "gad_anxious", "gad_worry")
# Define EU countries
# NOTE(review): 24 countries are listed, not all 27 EU member states;
# Lithuania, Luxembourg, and Malta are absent -- presumably because
# they were not surveyed (they do not appear in country_map below).
# Confirm this is intended as "EU countries represented in the sample".
eu_countries <- c(
"Austria",
"Belgium",
"Bulgaria",
"Croatia",
"Cyprus",
"Czech Republic",
"Denmark",
"Estonia",
"Finland",
"France",
"Germany",
"Greece",
"Hungary",
"Ireland",
"Italy",
"Latvia",
"Netherlands",
"Poland",
"Portugal",
"Romania",
"Slovakia",
"Slovenia",
"Spain",
"Sweden"
)
# List of countries whose weight scores were replaced by 1.
# Setting ps_weight = 1 means these samples are analysed unweighted --
# presumably because reliable post-stratification targets were not
# available for them; confirm against the weighting documentation.
flagged_countries <-
c("Moldova", "Romania", "Nigeria", "Montenegro", "Angola",
"Morocco", "Uruguay", "Paraguay", "Greece", "Iran",
"Hungary", "Kosovo", "Yemen", "Chile", "Uganda")Load data
The data collection began on June 2, 2025, with a soft-launch phase. The survey’s time zone was set to New York City. Due to time zone differences, some responses show a date of June 1, 2025, even though it was already June 2 in the collaborators’ local time. Some collaborators were residing in countries different from their target country.
# Raw public dataset
df_pub_raw <- base::readRDS("999_public_data.rds")
# View number of rows in the raw dataset
nrow(df_pub_raw)[1] 68311
# Cleaned dataset
# (output of preprocessing section A0; allows skipping straight to
# the analyses)
df_gmh <- base::readRDS("999_cleaned_data.rds")
# Create general design
# Post-stratification weights via ps_weight; ids = ~1 declares no
# clustering in the design.
svy <- survey::svydesign(ids = ~ 1, weights = ~ ps_weight, data = df_gmh)
# View number of rows in the cleaned dataset
nrow(df_gmh)[1] 53799
A0. Data Preprocessing
A0.1. Cleaning the dataset
Rename columns
# Rename raw Qualtrics question columns (Q*) to descriptive names,
# move Q_Language next to UserLanguage, and glimpse the result.
df_pub <- df_pub_raw |>
dplyr::rename(
duration_sec = `Duration (in seconds)`,
mpwb_competence = Q5,
mpwb_emotional_stability = Q7,
mpwb_engagement = Q9,
mpwb_meaning = Q11,
mpwb_optimism = Q13,
mpwb_positive_emotion = Q15,
mpwb_positive_relationships = Q17,
mpwb_resilience = Q19,
mpwb_self_esteem = Q21,
mpwb_vitality = Q23,
life_satisfaction = Q29,
income_orig = Q31,
income_text_orig = Q31_10_TEXT,
household_size = Q32,
birth_year_orig = Q25,
sex_orig = Q26,
education_orig = Q27,
employment_orig = Q28,
ethnicity_citizenship_orig = Q30,
assets_orig = Q34,
debts_orig = Q33,
bot_check = Q43,
followup = Q35,
phq_interest = Q36_1,
phq_down = Q36_2,
gad_anxious = Q36_3,
gad_worry = Q36_4,
childhood_SES = Q37,
fin_outlook = Q38,
fin_outlook_conf = Q39,
attention_care = Q40,
work_arrangement = Q41
) |>
dplyr::relocate(Q_Language, .after = UserLanguage) |>
# Overview of the data
dplyr::glimpse(width = 100)Rows: 68,311
Columns: 45
$ StartDate <dttm> 2025-06-01 07:14:43, 2025-06-01 07:33:44, 2025-06-01 19:24:40…
$ EndDate <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:06, 2025-06-01 19:30:50…
$ Status <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Progress <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 10…
$ duration_sec <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, …
$ Finished <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ RecordedDate <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:07, 2025-06-01 19:30:52…
$ ResponseId <chr> "R_2i29tTIFUyYilqv", "R_2nemeLi6AnL1uNP", "R_3LqMY0lbugweTSh",…
$ UserLanguage <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ Q_Language <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ mpwb_competence <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ life_satisfaction <dbl> 7, 6, 9, 8, 5, 8, 7, 10, 8, 10, 0, 8, 6, 9, 6, 5, 10, 8, 7, 8,…
$ income_orig <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, …
$ income_text_orig <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ household_size <dbl> 3, 5, 1, 4, 5, 4, 2, 4, 2, 1, 1, 6, 5, 6, 3, 4, 10, 7, 1, 12, …
$ birth_year_orig <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975"…
$ sex_orig <dbl> 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2,…
$ education_orig <dbl> 6, 6, 5, 8, 6, 3, 7, 5, 7, 6, 5, 8, 5, 7, 7, 6, 5, 6, 5, 4, 7,…
$ employment_orig <chr> "3", "3", "3", "3", "8", "2,8", "3", "3", "1", "6", "8", "3", …
$ ethnicity_citizenship_orig <chr> "3,6,10", "1,10", "5,10", "5,10", "3,10", "5,10", "1,10", "1,1…
$ assets_orig <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,00…
$ debts_orig <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.0…
$ bot_check <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ followup <dbl> 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1,…
$ phq_interest <dbl> NA, 2, NA, 2, 3, 6, 1, NA, 2, 1, NA, 2, NA, 1, 2, 2, NA, 1, 2,…
$ phq_down <dbl> NA, 3, NA, 1, 3, 4, 1, NA, 1, 1, NA, 2, NA, 2, 2, 2, NA, 1, 2,…
$ gad_anxious <dbl> NA, 2, NA, 2, 3, 7, 2, NA, 2, 1, NA, 3, NA, 2, 3, 2, NA, 1, 1,…
$ gad_worry <dbl> NA, 1, NA, 2, 3, 7, 1, NA, 1, 1, NA, 3, NA, 2, 3, 2, NA, 1, 2,…
$ childhood_SES <dbl> NA, 4, NA, 4, 2, 4, 1, NA, 1, 4, NA, 2, NA, 4, 4, 3, NA, 3, 4,…
$ fin_outlook <dbl> NA, 3, NA, 4, 4, 5, 5, NA, 5, 4, NA, 5, NA, 4, 5, 5, NA, 4, 5,…
$ fin_outlook_conf <dbl> NA, 10, NA, 8, 8, 10, 10, NA, 10, 8, NA, 8, NA, 9, 7, 8, NA, 1…
$ attention_care <dbl> NA, 5, NA, 5, 7, 4, 5, NA, 5, 6, NA, 5, NA, 5, 6, 5, NA, 5, 4,…
$ work_arrangement <dbl> NA, 4, NA, 3, NA, 1, 2, NA, 5, NA, NA, 1, NA, 1, 3, 4, NA, 1, …
$ br <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ bs <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ CoreMPWB_DO <chr> "Q4|Q23|Q6|Q21|Q8|Q15|Q10|Q19|Q12|Q9|Q14|Q5|Q16|Q7|Q18|Q17|Q20…
Identification of the Countries
$UserLanguage
User Language
AM-ARM AM-ETH AR-ARE AR-BHR AR-DZA AR-EGY AR-KWT AR-LBN AR-MAR AR-OMN AR-QAT AR-SAU AR-TCD AR-YEM BG-BGR BN-BGD BS-BIH CNR-MNE
334 303 66 100 203 322 106 416 302 520 503 296 7 577 393 536 642 358
CS-CZE DA-DNK DE-AUT DE-CHE DE-DEU EL-CYP EL-GRC EN EN-ARE EN-AUS EN-BHR EN-CAN EN-EGY EN-EST EN-ETH EN-GBR EN-GEN EN-GEO
267 338 685 452 1008 218 532 5549 270 605 111 535 547 9 249 852 69 54
EN-HKG EN-HUN EN-IDN EN-IND EN-IRL EN-KOR EN-KWT EN-MNG EN-MYS EN-NGA EN-NLD EN-NOR EN-PAK EN-PHL EN-QAT EN-SGP EN-UGA EN-YEM
17 6 12 921 461 11 209 40 203 721 161 26 347 2280 23 298 332 3
EN-ZAF EN-ZMB EN-ZWE ES-ARG ES-BOL ES-CHL ES-ECU ES-ESP ES-MEX ES-PER ES-PRY ES-URY ES-US ET-EST FA-IRN FI-FIN FIL-PHL FR-BEL
279 34 169 769 341 240 1075 729 1164 1031 205 815 159 2393 292 275 1276 70
FR-CAN FR-CHE FR-FRA FR-MDG FR-SEN FR-TCD HE-ISR HI-IND HR-HRV HU-HUN ID-IDN IT-CHE IT-ITA JA-JPN KA-GEO KK-KAZ KO-KOR KY-KGZ
339 292 1175 169 211 185 437 706 455 729 1489 79 566 549 450 131 481 166
LV-LVA MK-MKD MN-MNG MS-MYS NL-BEL NL-NLD NO-NOR PL-POL PT-AGO PT-BRA PT-MOZ PT-PRT PT-TLS RO-MDA RO-ROU RU-KAZ RU-KGZ RU-RUS
1023 268 327 613 261 287 483 1288 329 2094 154 579 277 511 861 656 209 1322
RU-UZB SK-SVK SL-SVN SN-ZWE SQI-ALB SQI-XKX SR-SRB SR-XKX SV-SWE TH-THA TR-TUR UK-UKR UR-PAK UZ-UZB ZH-CHN ZH-HKG ZH-TWN <NA>
119 724 746 106 2284 1371 420 2 1149 440 682 749 160 543 2523 220 201 0
Class: character
# Create column with country names mapped from UserLanguage
# Lookup table: survey language code ("LANG-ISO3") -> country name.
# Several codes map to the same country (multilingual countries);
# bare "EN" and "ES-US" denote the USA, and "EN-GEN" is the global
# version with no target country.
country_map <- c(
"SQI-ALB" = "Albania",
"AR-DZA" = "Algeria",
"PT-AGO" = "Angola",
"ES-ARG" = "Argentina",
"AM-ARM" = "Armenia",
"EN-AUS" = "Australia",
"DE-AUT" = "Austria",
"AR-BHR" = "Bahrain",
"EN-BHR" = "Bahrain",
"BN-BGD" = "Bangladesh",
"FR-BEL" = "Belgium",
"NL-BEL" = "Belgium",
"ES-BOL" = "Bolivia",
"BS-BIH" = "Bosnia-Herzegovina",
"PT-BRA" = "Brazil",
"BG-BGR" = "Bulgaria",
"EN-CAN" = "Canada",
"FR-CAN" = "Canada",
"AR-TCD" = "Chad",
"FR-TCD" = "Chad",
"ES-CHL" = "Chile",
"ZH-CHN" = "China",
"HR-HRV" = "Croatia",
"EL-CYP" = "Cyprus",
"CS-CZE" = "Czech Republic",
"DA-DNK" = "Denmark",
"ES-ECU" = "Ecuador",
"AR-EGY" = "Egypt",
"EN-EGY" = "Egypt",
"EN-EST" = "Estonia",
"ET-EST" = "Estonia",
"AM-ETH" = "Ethiopia",
"EN-ETH" = "Ethiopia",
"FR-FRA" = "France",
"FI-FIN" = "Finland",
"EN-GEO" = "Georgia",
"KA-GEO" = "Georgia",
"DE-DEU" = "Germany",
"EL-GRC" = "Greece",
"EN-HKG" = "Hong Kong",
"ZH-HKG" = "Hong Kong",
"EN-HUN" = "Hungary",
"HU-HUN" = "Hungary",
"EN-IND" = "India",
"HI-IND" = "India",
"ID-IDN" = "Indonesia",
"EN-IDN" = "Indonesia",
"FA-IRN" = "Iran",
"EN-IRL" = "Ireland",
"HE-ISR" = "Israel",
"IT-ITA" = "Italy",
"JA-JPN" = "Japan",
"KK-KAZ" = "Kazakhstan",
"RU-KAZ" = "Kazakhstan",
"EN-KOR" = "Republic of Korea",
"KO-KOR" = "Republic of Korea",
"SQI-XKX" = "Kosovo",
"SR-XKX" = "Kosovo",
"AR-KWT" = "Kuwait",
"EN-KWT" = "Kuwait",
"KY-KGZ" = "Kyrgyzstan",
"RU-KGZ" = "Kyrgyzstan",
"LV-LVA" = "Latvia",
"AR-LBN" = "Lebanon",
"MK-MKD" = "North Macedonia",
"FR-MDG" = "Madagascar",
"MS-MYS" = "Malaysia",
"EN-MYS" = "Malaysia",
"ES-MEX" = "Mexico",
"RO-MDA" = "Moldova",
"EN-MNG" = "Mongolia",
"MN-MNG" = "Mongolia",
"CNR-MNE" = "Montenegro",
"AR-MAR" = "Morocco",
"PT-MOZ" = "Mozambique",
"NL-NLD" = "Netherlands",
"EN-NLD" = "Netherlands",
"EN-NGA" = "Nigeria",
"EN-NOR" = "Norway",
"NO-NOR" = "Norway",
"AR-OMN" = "Oman",
"UR-PAK" = "Pakistan",
"EN-PAK" = "Pakistan",
"ES-PRY" = "Paraguay",
"ES-PER" = "Peru",
"EN-PHL" = "Philippines",
"FIL-PHL" = "Philippines",
"PL-POL" = "Poland",
"PT-PRT" = "Portugal",
"AR-QAT" = "Qatar",
"EN-QAT" = "Qatar",
"RO-ROU" = "Romania",
"RU-RUS" = "Russia",
"AR-SAU" = "Saudi Arabia",
"FR-SEN" = "Senegal",
"SR-SRB" = "Serbia",
"EN-SGP" = "Singapore",
"SK-SVK" = "Slovakia",
"SL-SVN" = "Slovenia",
"EN-ZAF" = "South Africa",
"ES-ESP" = "Spain",
"SV-SWE" = "Sweden",
"FR-CHE" = "Switzerland",
"DE-CHE" = "Switzerland",
"IT-CHE" = "Switzerland",
"ZH-TWN" = "Taiwan",
"TH-THA" = "Thailand",
"PT-TLS" = "Timor-Leste",
"TR-TUR" = "Türkiye",
"EN-UGA" = "Uganda",
"UK-UKR" = "Ukraine",
"AR-ARE" = "UAE",
"EN-ARE" = "UAE",
"EN-GBR" = "UK",
"EN" = "USA",
"ES-US" = "USA",
"ES-URY" = "Uruguay",
"RU-UZB" = "Uzbekistan",
"UZ-UZB" = "Uzbekistan",
"AR-YEM" = "Yemen",
"EN-YEM" = "Yemen",
"EN-ZMB" = "Zambia",
"EN-ZWE" = "Zimbabwe",
"SN-ZWE" = "Zimbabwe",
"EN-GEN" = "Global"
)
# Derive country name and ISO codes from the survey language code,
# then place them next to UserLanguage.
df_pub <- df_pub |>
dplyr::mutate(
# Identify country based on UserLanguage
country = country_map[UserLanguage],
# Transform UserLanguage to ISO codes
# (the last three characters identify the ISO3 code except USA)
iso3 = stringr::str_extract(UserLanguage, "[A-Z]{3}$"),
# Clean the code for the USA
iso3 = dplyr::case_when(
UserLanguage == "EN" ~ "USA",
UserLanguage == "ES-US" ~ "USA",
UserLanguage == "EN-GEN" ~ NA_character_,
TRUE ~ iso3
),
# Convert ISO3 to ISO2
# ("XKX" is Kosovo's user-assigned code, unknown to countrycode,
# hence the custom_match)
iso2 = countrycode::countrycode(
iso3,
origin = "iso3c",
destination = "iso2c",
custom_match = c("XKX" = "XK"))
) |>
dplyr::relocate(country, iso2, iso3, .after = UserLanguage)
# Sanity check: Cross-tab of countries by language
df_pub |>
dplyr::count(country, iso2, iso3, sort = TRUE) |>
dplyr::filter(!is.na(country)) |>
print_reactable(sorted_col = "country", width = 500)Global Version Processing
A global version of the survey was created to ensure people from countries that weren’t specifically targeted in this study or whose native languages weren’t provided could still take part. This version didn’t have any changes made for specific countries. There was only an open-text field for the income item, and all financial items asked for values in USD.
# Identify country and citizenship
# Manually-identified countries for global-version respondents,
# keyed by ResponseId.
gen_ident <-
readr::read_csv("111_generic_version_country.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 69
Columns: 2
$ ResponseId <chr> "R_4CJBLtS3qvvRTf7", "R_2EGKdy6ce2zvQls", "R_8f1msPTljX0SGpw", "R_7f1bVmdQG7qh…
$ country_gen <chr> "Australia", "Austria", "Austria", "Bangladesh", "Bangladesh", "Bangladesh", "…
[1] 68311
# Attach the global-version country next to the main country column;
# rows without a match get NA in country_gen.
df_pub <- df_pub |>
dplyr::left_join(gen_ident, by = "ResponseId") |>
dplyr::relocate(country_gen, .after = country)
nrow(df_pub)[1] 68311
# Sanity check: View the country counts of global version participants
# It was not possible to identify the country for one participant
# (the NA row in the printed tibble)
df_pub |> dplyr::filter(UserLanguage == "EN-GEN") |>
dplyr::group_by(country_gen) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 34 × 2
country_gen n
<chr> <int>
1 Afghanistan 1
2 Australia 1
3 Austria 2
4 Bangladesh 4
5 Belgium 7
6 Bhutan 2
7 Colombia 3
8 Democratic Republic of the Congo 1
9 Dominican Republic 1
10 Finland 1
11 France 3
12 Germany 2
13 Guatemala 1
14 Honduras 1
15 India 2
16 Italy 1
17 Kenya 1
18 Korea 1
19 Lebanon 7
20 Namibia 1
21 Nepal 1
22 New Zealand 2
23 Norway 1
24 Oman 5
25 Pakistan 1
26 Philippines 2
27 Sri Lanka 1
28 Sweden 1
29 Thailand 1
30 UAE 4
31 UK 2
32 Zambia 3
33 Zimbabwe 1
34 <NA> 1
Exclusion of Countries with Small Sample Sizes
We excluded the Global version and Zambia because the sample sizes were not sufficiently large. The Global version does not have the country-specific changes that were made in the target countries, consequently those answers can’t be compared. Zambia is not included because it only has 34 participants, which is less than the 120 required.
# View countries with less than 120 participants
# (only the Global version and Zambia fall below the threshold)
df_pub |>
dplyr::group_by(country) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::filter(n < 120) |>
dplyr::arrange(n)# A tibble: 2 × 2
country n
<chr> <int>
1 Zambia 34
2 Global 69
[1] 68311
# Drop the Global version and Zambia, then remove country_gen
# (only needed for global-version respondents).
df_pub <- df_pub |>
dplyr::filter(UserLanguage != "EN-GEN", UserLanguage != "EN-ZMB") |>
dplyr::select(-country_gen)
nrow(df_pub)[1] 68208
Location Validation
# Administrative location validation data, keyed by ResponseId
# (one row per response; same row count as df_pub).
location <-
readr::read_csv("111_administrative_location.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 68,208
Columns: 7
$ ResponseId <chr> "R_4rOh5csuvsUlmsF", "R_9Hk3KD5bE28n9bn", "R_8plovBuEUJfYQRO", "R_5miAsDI8Pi7…
$ loc_country <chr> "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", …
$ loc_resident <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_admin_1 <chr> "Yerevan", "Yerevan", "Yerevan", "Syunik", "Yerevan", "Yerevan", "Yerevan", "…
$ loc_admin_2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ lat <dbl> 40.18720, 40.18720, 40.18720, 39.50899, 40.18720, 40.18720, 40.18720, 40.1872…
$ long <dbl> 44.51521, 44.51521, 44.51521, 46.34389, 44.51521, 44.51521, 44.51521, 44.5152…
[1] 68208
# Merge the location-validation columns and position them after
# Q_Language; the three checks below confirm a complete 1:1 join.
df_pub <- df_pub |>
dplyr::left_join(location, by = "ResponseId") |>
dplyr::relocate(
loc_resident,
loc_country,
loc_admin_1,
loc_admin_2,
lat,
long,
.after = Q_Language
)
# Sanity check: Number of rows should remain the same
nrow(df_pub)[1] 68208
# Sanity check: How many missing location validations are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(loc_resident)))[1] 0
# Sanity check: How many missing latitudes are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(lat) & !is.na(loc_country)))[1] 0
# Sanity check: View the counts of location validation
# (rows where loc_resident == 0, i.e. respondents not located in
# their target country)
df_pub |> dplyr::filter(loc_resident == 0) |>
dplyr::group_by(country, loc_resident) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "country", width = 500)# Sanity check: View the counts of administrative level units per country
# For validated residents, count distinct admin-1 and admin-2 units
# per country.
df_pub |> dplyr::filter(loc_resident == 1) |>
tidyr::pivot_longer(
cols = c(loc_admin_1, loc_admin_2),
names_to = "admin_level",
values_to = "value"
) |>
dplyr::summarise(
unique_n = dplyr::n_distinct(value, na.rm = TRUE),
.by = c(country, admin_level)
) |>
print_reactable(sorted_col = "country", width = 500)Merge Sponsored Participants from Ireland’s Team
All participants in the Irish team’s sponsored dataset completed the survey, and only the mandatory items were included. Some variables had different response options than those in the main dataset.
# Merge the sponsored Irish participants
# Raw export of the sponsored Irish sample (1,200 complete responses,
# mandatory items only).
df_irl_raw <-
readr::read_csv("999_irish_sponsored_public.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 1,200
Columns: 21
$ utcdateandtime <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", …
$ participantprivateid <dbl> 13767545, 13767552, 13767549, 13767554, 13767544, 13767547, …
$ branchpbkg <chr> "male", "female", "female", "female", "female", "female", "m…
$ qid12object4response <chr> "Agree", "Strongly Agree", "Strongly Disagree", "Agree", "St…
$ qid13object6response <chr> "Strongly Agree", "Agree", "Agree", "Disagree", "Agree", "Ag…
$ qid14object8response <chr> "Strongly Agree", "Strongly Agree", "Strongly Disagree", "Ag…
$ qid15object9response <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid16object10response <chr> "Agree", "Strongly Agree", "Agree", "Disagree", "Agree", "Di…
$ qid17object11response <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid18object12response <chr> "Agree", "Absolutely Agree", "Strongly Agree", "Strongly Agr…
$ qid19object13response <chr> "Agree", "Agree", "Agree", "Strongly Agree", "Strongly Agree…
$ qid20object14response <chr> "Agree", "Strongly Agree", "Agree", "Agree", "Strongly Agree…
$ qid20object15response <chr> "Agree", "Agree", "Neutral", "Disagree", "Agree", "Disagree"…
$ qid29object17response <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7,…
$ born_locationobject5response <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__ot…
$ educationobject8response <chr> "Leaving Certificate", "Degree", "Master's", "Technical or V…
$ employmentobject9response <chr> "Employed full-time", "Employed full-time", "Seeking Employm…
$ incomeobject12quantised <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5,…
$ incomeobject12response <chr> "€67,001 - €85,000", "€67,001 - €85,000", "€85,001 - €105,00…
$ P1ageobject377Response <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, …
$ numhouseholdobject375Response <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, …
# Rename the sponsored Irish columns to the main dataset's names,
# recode demographic items into the main coding scheme (*_orig
# columns), add the constant identification columns, and convert the
# MPWB Likert text labels to numeric scores 1-7.
df_irl <- df_irl_raw |>
dplyr::transmute(
StartDate_irl = utcdateandtime,
ResponseId = as.character(participantprivateid),
sex_irl = branchpbkg,
mpwb_competence = qid12object4response,
mpwb_emotional_stability = qid13object6response,
mpwb_engagement = qid14object8response,
mpwb_meaning = qid15object9response,
mpwb_optimism = qid16object10response,
mpwb_positive_emotion = qid17object11response,
mpwb_positive_relationships = qid18object12response,
mpwb_resilience = qid19object13response,
mpwb_self_esteem = qid20object14response,
mpwb_vitality = qid20object15response,
life_satisfaction = qid29object17response,
ethnicity_citizenship_irl = born_locationobject5response,
# The education options are slightly different from the version
# used for Ireland non-sponsored participants
education_irl = educationobject8response,
# The employment options are slightly different from the version
# used for Ireland non-sponsored participants
employment_irl = employmentobject9response,
# The income brackets are slightly different from the version
# used for Ireland non-sponsored participants
income_irl = incomeobject12quantised,
household_size = numhouseholdobject375Response,
age = P1ageobject377Response
) |>
dplyr::mutate(
sex_orig = dplyr::case_when(
sex_irl == "male" ~ 1,
sex_irl == "female" ~ 2,
sex_irl == "other" ~ 3,
TRUE ~ NA_integer_
),
ethnicity_citizenship_orig = dplyr::case_when(
# The only options given were "Ireland" and "__other"
ethnicity_citizenship_irl == "Ireland" ~ "10",
ethnicity_citizenship_irl == "__other" ~ "11",
TRUE ~ NA_character_
),
education_orig = dplyr::case_when(
education_irl == "Less than Junior (Inter) Cert" ~ 1,
education_irl == "Junior (Inter) Certificate or Equivalent" ~ 2,
education_irl == "Leaving Certificate" ~ 3,
education_irl == "Technical or Vocational Certificate" ~ 4,
education_irl == "Diploma" ~ 5,
education_irl == "Degree" ~ 6,
education_irl == "Master's" ~ 7,
education_irl == "Doctorate" ~ 8,
TRUE ~ NA_integer_
),
employment_orig = dplyr::case_when(
employment_irl == "Employed full-time" ~ "3",
employment_irl == "Employed part-time" ~ "4",
employment_irl == "Student" ~ "1",
employment_irl == "Seeking Employment/Unemployed" ~ "8",
employment_irl == "Homemaker/Carer" ~ "7",
employment_irl == "Unable to Work" ~ "9",
employment_irl == "Retired" ~ "6",
# The option below is not in the original coding scheme
employment_irl == "Self-employed" ~ NA_character_,
TRUE ~ NA_character_
),
income_orig = dplyr::if_else(
# The option 10 = "Prefer not to say" is recoded to NA
income_irl == 10,
NA_integer_,
income_irl
),
# Constant identification columns: all sponsored rows are flagged
# with a dedicated language code and irl = 1.
Q_Language = "EN-IRL-sponsored",
UserLanguage = "EN-IRL-sponsored",
iso3 = "IRL",
iso2 = "IE",
country = "Ireland",
loc_resident = 1,
loc_country = "Ireland",
# NOTE(review): these coordinates lie on Ireland's far west coast,
# not a population centroid -- confirm this placeholder is intended
lat = 53.3861632,
long = -10.5940283,
irl = 1
) |>
# We need to recode the MPWB items from text to numerical
dplyr::mutate(
dplyr::across(
dplyr::all_of(mpwb_items),
~ as.numeric(base::factor(
.,
levels = c(
# first level will be coded as 1
"Absolutely Disagree",
# second level will be coded as 2, etc.
"Strongly Disagree",
"Disagree",
"Neutral",
"Agree",
"Strongly Agree",
"Absolutely Agree"
)
)))) |>
dplyr::glimpse(width = 100)Rows: 1,200
Columns: 35
$ StartDate_irl <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", "0…
$ ResponseId <chr> "13767545", "13767552", "13767549", "13767554", "13767544", "1…
$ sex_irl <chr> "male", "female", "female", "female", "female", "female", "mal…
$ mpwb_competence <dbl> 5, 6, 2, 5, 6, 5, 5, 5, 4, 5, 4, 4, 6, 7, 7, 5, 6, 6, 5, 7, 2,…
$ mpwb_emotional_stability <dbl> 6, 5, 5, 3, 5, 5, 6, 5, 3, 5, 4, 5, 5, 6, 7, 5, 5, 7, 6, 5, 2,…
$ mpwb_engagement <dbl> 6, 6, 2, 5, 5, 5, 6, 3, 5, 5, 5, 5, 5, 7, 7, 6, 5, 4, 5, 4, 2,…
$ mpwb_meaning <dbl> 6, 7, 5, 5, 5, 4, 5, 5, 4, 5, 2, 5, 5, 7, 7, 5, 7, 7, 5, 7, 3,…
$ mpwb_optimism <dbl> 5, 6, 5, 3, 5, 3, 6, 5, 4, 5, 3, 5, 5, 7, 7, 5, 7, 7, 6, 7, 3,…
$ mpwb_positive_emotion <dbl> 6, 7, 5, 5, 5, 4, 6, 5, 4, 5, 3, 5, 5, 5, 6, 5, 6, 7, 7, 5, 3,…
$ mpwb_positive_relationships <dbl> 5, 7, 6, 6, 5, 5, 7, 5, 5, 6, 3, 6, 5, 4, 6, 6, 5, 6, 6, 5, 2,…
$ mpwb_resilience <dbl> 5, 5, 5, 6, 6, 4, 5, 5, 5, 4, 5, 4, 5, 5, 5, 6, 6, 7, 6, 6, 3,…
$ mpwb_self_esteem <dbl> 5, 6, 5, 5, 6, 4, 5, 5, 4, 5, 3, 5, 5, 1, 4, 5, 6, 7, 6, 5, 2,…
$ mpwb_vitality <dbl> 5, 5, 4, 3, 5, 3, 5, 5, 4, 5, 3, 4, 4, 1, 4, 5, 6, 7, 4, 4, 2,…
$ life_satisfaction <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7, 2…
$ ethnicity_citizenship_irl <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__othe…
$ education_irl <chr> "Leaving Certificate", "Degree", "Master's", "Technical or Voc…
$ employment_irl <chr> "Employed full-time", "Employed full-time", "Seeking Employmen…
$ income_irl <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ household_size <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, 3,…
$ age <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, 34…
$ sex_orig <dbl> 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2,…
$ ethnicity_citizenship_orig <chr> "10", "10", "10", "10", "10", "11", "10", "10", "10", "10", "1…
$ education_orig <dbl> 3, 6, 7, 4, 6, 6, 7, 4, 5, 6, 7, 3, 7, 7, 7, 6, 3, 7, 6, 3, 6,…
$ employment_orig <chr> "3", "3", "8", "3", "3", "7", "6", "3", "7", "3", "7", "4", "3…
$ income_orig <dbl> 6, 6, 7, 3, 7, 2, 5, 6, NA, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ Q_Language <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ UserLanguage <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ iso3 <chr> "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL",…
$ iso2 <chr> "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "I…
$ country <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ loc_resident <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_country <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ lat <dbl> 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53…
$ long <dbl> -10.59403, -10.59403, -10.59403, -10.59403, -10.59403, -10.594…
$ irl <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Sanity check: View the counts of each option
# Cross-tab of the recoded numeric scores (rows) against the original
# text responses (columns); a correct recode shows exactly one
# non-zero cell per row.
base::table(df_irl$mpwb_competence, df_irl_raw$qid12object4response, useNA = "always")
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 18 0 0 0 0 0 0
2 0 0 0 0 0 0 22 0
3 0 0 0 59 0 0 0 0
4 0 0 0 0 262 0 0 0
5 0 0 498 0 0 0 0 0
6 0 0 0 0 0 229 0 0
7 112 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 20 0 0 0 0 0 0
2 0 0 0 0 0 0 39 0
3 0 0 0 148 0 0 0 0
4 0 0 0 0 231 0 0 0
5 0 0 490 0 0 0 0 0
6 0 0 0 0 0 192 0 0
7 80 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 11 0 0 0 0 0 0
2 0 0 0 0 0 0 18 0
3 0 0 0 111 0 0 0 0
4 0 0 0 0 362 0 0 0
5 0 0 480 0 0 0 0 0
6 0 0 0 0 0 157 0 0
7 61 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 24 0 0 0 0 0 0
2 0 0 0 0 0 0 26 0
3 0 0 0 82 0 0 0 0
4 0 0 0 0 264 0 0 0
5 0 0 479 0 0 0 0 0
6 0 0 0 0 0 213 0 0
7 112 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 21 1 8 3 2 4 0
2 0 2 0 15 13 1 8 0
3 0 0 21 36 45 2 4 0
4 4 2 111 20 144 17 2 0
5 15 0 302 7 47 51 1 0
6 26 0 52 0 9 95 0 0
7 66 0 20 0 2 20 1 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 1 15 1 2 0 1 5 0
2 0 0 5 2 5 2 6 0
3 0 7 17 22 30 3 7 0
4 4 1 85 40 112 14 7 0
5 20 1 293 14 99 79 1 0
6 25 0 64 2 15 82 0 0
7 62 0 14 0 3 32 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 26 0 0 0 0 0 0
2 0 0 0 0 0 0 22 0
3 0 0 0 84 0 0 0 0
4 0 0 0 0 179 0 0 0
5 0 0 498 0 0 0 0 0
6 0 0 0 0 0 235 0 0
7 156 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 23 0 0 0 0 0 0
2 0 0 0 0 0 0 34 0
3 0 0 0 145 0 0 0 0
4 0 0 0 0 248 0 0 0
5 0 0 504 0 0 0 0 0
6 0 0 0 0 0 159 0 0
7 87 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 39 0 0 0 0 0 0
2 0 0 0 0 0 0 34 0
3 0 0 0 110 0 0 0 0
4 0 0 0 0 264 0 0 0
5 0 0 460 0 0 0 0 0
6 0 0 0 0 0 176 0 0
7 117 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 55 0 0 0 0 0 0
2 0 0 0 0 0 0 87 0
3 0 0 0 247 0 0 0 0
4 0 0 0 0 325 0 0 0
5 0 0 330 0 0 0 0 0
6 0 0 0 0 0 108 0 0
7 48 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
# For the rows that are not in the Irish dataset
# (df_pub), flag irl = 0; the df_irl rows already carry irl = 1.
df_pub$irl <- 0
# Merge both datasets
# bind_rows() keeps the union of columns and fills absent ones with NA.
df_merged <- dplyr::bind_rows(df_pub, df_irl) |>
# Place each Irish-specific (_irl) variable directly after its original
# counterpart so the paired columns can be inspected side by side.
# NOTE: the relocate order matters — each .after target must already be
# in its final position when the next call runs.
dplyr::relocate(StartDate_irl, .after = StartDate) |>
dplyr::relocate(Q_Language, .after = ResponseId) |>
dplyr::relocate(age, .after = birth_year_orig) |>
dplyr::relocate(sex_irl, .after = sex_orig) |>
dplyr::relocate(ethnicity_citizenship_irl, .after = ethnicity_citizenship_orig) |>
dplyr::relocate(employment_irl, .after = employment_orig) |>
dplyr::relocate(education_irl, .after = education_orig) |>
dplyr::relocate(income_irl, .after = income_orig)
# Total sample size before individual exclusion criteria
nrow(df_merged)[1] 69408
# Sanity check:
# Is the sum of rows of both individual datasets equal to the merged dataset?
(length(df_irl$ResponseId) + length(df_pub$ResponseId)) ==
length(df_merged$ResponseId)[1] TRUE
MPWB
# Sanity check: View the counts of each option for every MPWB item.
# Use `[[` with the column name instead of eval(parse(text = sprintf(...))):
# building code from strings is fragile and unnecessary for column lookup.
for (i in mpwb_items) {
  table_label(df_pub[[i]])
  cat("\n")
}
I receive help and support from people I am close to when I need it.
1 2 3 4 5 6 7 <NA>
1751 1955 4594 7323 21636 14217 11715 5017
Class: numeric
$mpwb_meaning
I feel what I do in my life is valuable and worthwhile.
1 2 3 4 5 6 7 <NA>
1966 2448 5393 8627 21037 13474 10271 4992
Class: numeric
$mpwb_competence
I feel a sense of accomplishment from what I do.
1 2 3 4 5 6 7 <NA>
1907 2582 6319 9177 22141 13119 7974 4989
Class: numeric
$mpwb_engagement
I feel absorbed in what I am doing.
1 2 3 4 5 6 7 <NA>
1229 2041 6701 11039 22993 12211 7040 4954
Class: numeric
$mpwb_self_esteem
I feel positive about myself.
1 2 3 4 5 6 7 <NA>
2009 2703 6688 9186 21232 12895 8582 4913
Class: numeric
$mpwb_optimism
I am optimistic about my future.
1 2 3 4 5 6 7 <NA>
2770 3118 6488 10098 19560 11972 9255 4947
Class: numeric
$mpwb_positive_emotion
I feel happy.
1 2 3 4 5 6 7 <NA>
2114 2673 6253 12053 21446 11308 7423 4938
Class: numeric
$mpwb_emotional_stability
I feel calm and peaceful.
1 2 3 4 5 6 7 <NA>
2571 3882 10398 11446 19752 9404 5835 4920
Class: numeric
$mpwb_resilience
I recover quickly from things that go wrong in my life.
1 2 3 4 5 6 7 <NA>
2385 3916 10520 10552 21264 9366 5205 5000
Class: numeric
$mpwb_vitality
I feel full of energy.
1 2 3 4 5 6 7 <NA>
3422 5107 11610 12272 17740 8110 5015 4932
Class: numeric
# Compute per-participant MPWB completeness and summary scores.
df_merged <- df_merged |>
# rowwise(): each c_across() below operates within a single participant's row
dplyr::rowwise() |>
dplyr::mutate(
# Identify participants that completed all MPWB items
# (count of non-missing answers across the 10 MPWB items)
mpwb_n = base::sum(!is.na(dplyr::c_across(dplyr::all_of(mpwb_items)))),
# Calculate the mean, variance, and sum score of the MPWB items,
# explicitly only for participants who answered all 10 items;
# everyone else receives NA.
mpwb_mean = dplyr::if_else(
mpwb_n == 10,
base::mean(dplyr::c_across(dplyr::all_of(mpwb_items))),
NA_real_
),
mpwb_var = dplyr::if_else(
mpwb_n == 10,
stats::var(dplyr::c_across(dplyr::all_of(mpwb_items))),
NA_real_
),
mpwb_sum = dplyr::if_else(
mpwb_n == 10,
base::sum(dplyr::c_across(dplyr::all_of(mpwb_items))),
NA_real_
)
) |>
# remove the rowwise computation
dplyr::ungroup() |>
# organise the variables positions
dplyr::relocate(mpwb_n:mpwb_sum, .after = mpwb_vitality)
# Sanity check: View the new MPWB variables
dplyr::glimpse(df_merged |> dplyr::select(dplyr::starts_with("mpwb_")), width = 100)Rows: 69,408
Columns: 14
$ mpwb_competence <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ mpwb_n <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
$ mpwb_mean <dbl> 5.7, 4.1, 4.9, 5.6, 4.7, 4.5, 5.8, 7.0, 5.4, 6.1, 4.3, 5.9, 4.…
$ mpwb_var <dbl> 0.4555556, 2.3222222, 0.1000000, 0.7111111, 1.3444444, 2.05555…
$ mpwb_sum <dbl> 57, 41, 49, 56, 47, 45, 58, 70, 54, 61, 43, 59, 49, 61, 48, 40…
# Sanity check: Are there missing values in the sum score when mpwb_n is 10?
base::table(df_merged$mpwb_n, is.na(df_merged$mpwb_sum), useNA = "always")
FALSE TRUE <NA>
0 0 3247 0
1 0 722 0
2 0 396 0
3 0 372 0
4 0 275 0
5 0 239 0
6 0 240 0
7 0 193 0
8 0 152 0
9 0 174 0
10 63398 0 0
<NA> 0 0 0
# Sanity check: Are there values in the var score when mpwb_n is not 10?
df_merged |> dplyr::filter(mpwb_n != 10 & (!is.na(mpwb_var))) |> base::nrow()[1] 0
Completion time
# Derive a per-item adjusted completion time for each participant.
df_merged <- df_merged |>
dplyr::rowwise() |>
dplyr::mutate(
# Count how many items were answered (not NA) after the debts item
# (all items up to the debts item were forced-response)
n_items_after = base::sum(!is.na(dplyr::c_across(
c(
followup,
phq_interest,
phq_down,
gad_anxious,
gad_worry,
childhood_SES,
fin_outlook,
fin_outlook_conf,
attention_care,
work_arrangement
)
))),
# Calculate adjusted duration if the mandatory items were completed.
# Some survey versions have different variables of the same item,
# but all versions have 20 mandatory items before debts.
# A non-missing debts_orig therefore indicates completion of the
# mandatory portion; otherwise both derived variables stay NA.
total_items = dplyr::if_else(
!is.na(debts_orig),
20 + n_items_after,
NA_real_),
# Seconds spent per answered item (duration_sec / total_items).
duration_adj = dplyr::if_else(
!is.na(debts_orig),
duration_sec / total_items,
NA_real_)
) |>
dplyr::ungroup() |>
# organise the variables positions
dplyr::relocate(n_items_after:duration_adj, .after = duration_sec)
# Sanity check: View the new variables
dplyr::glimpse(df_merged |> dplyr::select(duration_sec:duration_adj), width = 100)Rows: 69,408
Columns: 4
$ duration_sec <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, 6735, 120, 270…
$ n_items_after <int> 1, 10, 1, 10, 9, 10, 10, 1, 10, 9, 1, 10, 1, 10, 10, 10, 1, 10, 9, 1, 9, 10,…
$ total_items <dbl> 21, 30, 21, 30, 29, 30, 30, 21, 30, 29, 21, 30, 21, 30, 30, 30, 21, 30, 29, …
$ duration_adj <dbl> 48.952381, 14.733333, 17.619048, 14.200000, 17.655172, 11.466667, 11.366667,…
# Sanity check: Is there a mismatch between n_items_after and total_items?
base::table(df_merged$n_items_after, df_merged$total_items, useNA = "always")
20 21 24 25 26 27 28 29 30 <NA>
0 1899 0 0 0 0 0 0 0 0 13381
1 0 14144 0 0 0 0 0 0 0 0
4 0 0 6 0 0 0 0 0 0 0
5 0 0 0 44 0 0 0 0 0 0
6 0 0 0 0 80 0 0 0 0 0
7 0 0 0 0 0 1 0 0 0 0
8 0 0 0 0 0 0 1313 0 0 0
9 0 0 0 0 0 0 0 9303 0 0
10 0 0 0 0 0 0 0 0 29237 0
<NA> 0 0 0 0 0 0 0 0 0 0
# Sanity check: Is there unexpected missing values in total_items?
df_merged |>
dplyr::summarise(
all_total_items_missing_when_debts_missing =
all(is.na(total_items[is.na(debts_orig)])),
any_total_items_present_when_debts_missing =
any(!is.na(total_items[is.na(debts_orig)])))# A tibble: 1 × 2
all_total_items_missing_when_debts_missing any_total_items_present_when_debts_missing
<lgl> <lgl>
1 TRUE FALSE
# Sanity check: Is there unexpected missing values in n_items_after?
base::table(df_merged$n_items_after, is.na(df_merged$debts_orig), useNA = "always")
FALSE TRUE <NA>
0 1899 13381 0
1 14144 0 0
4 6 0 0
5 44 0 0
6 80 0 0
7 1 0 0
8 1313 0 0
9 9303 0 0
10 29237 0 0
<NA> 0 0 0
# Sanity check: View the range of duration_adj
df_merged |>
dplyr::filter(!is.na(debts_orig)) |>
dplyr::summarise(
min_duration_adj = min(duration_adj, na.rm = TRUE),
max_duration_adj = max(duration_adj, na.rm = TRUE)
)# A tibble: 1 × 2
min_duration_adj max_duration_adj
<dbl> <dbl>
1 1.7 22309.
# Plot intra-individual variance vs time, faceted by country
# NOTE(review): df_sub is assumed to be a country-level subset of df_merged
# built in an earlier chunk — confirm upstream. Each facet shows a loess
# smooth of within-person MPWB variance against per-item completion time.
ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_var)) +
ggplot2::geom_point(alpha = 0.25, size = 0.8) +
ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
ggplot2::facet_wrap(~ country, scales = "free_y", ncol = 4, nrow = 25) +
ggplot2::labs(
x = "Duration adjusted (seconds)",
y = "Within-person variance across MPWB"
) +
ggplot2::theme(
strip.text = ggplot2::element_text(size = 9, face = "bold"),
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
)ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_sum)) +
ggplot2::geom_point(alpha = 0.2, size = 0.8) +
ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
ggplot2::facet_wrap(~ country, scales = "free", ncol = 4, nrow = 23) +
ggplot2::labs(
x = "Duration adjusted (seconds)",
y = "MPWB Sum"
) +
ggplot2::theme(
strip.text = ggplot2::element_text(size = 9, face = "bold"),
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
)PHQ4
The original PHQ-4 has vague verbal anchors that could limit the comparability of results across languages. For example, the option “Several days” could be interpreted as 2-3 days or as more than 7 days in other languages. Since “One week” is not more than 7 days, we decided to recode it as “Several days”.
| Used anchors | Original anchors | Recoded value |
|---|---|---|
| Never (1) | Not at all (0) | 0 |
| Once or twice (1–2) (2) | Several days (1) | 1 |
| A few days (3–4) (3) | Several days (1) | 1 |
| Several days (4) | Several days (1) | 1 |
| One week (5) | Several days (1) | 1 |
| More than a week (6) | More than half the days (2) | 2 |
| Every day / nearly every day(7) | Nearly every day (3) | 3 |
# Sanity check: View the counts of each option for every PHQ-4 item.
# Use `[[` with the column name instead of eval(parse(text = sprintf(...))):
# building code from strings is fragile and unnecessary for column lookup.
for (i in phq4_items) {
  table_label(df_merged[[i]])
  cat("\n")
}
Over the last 2 weeks, how often have you been bothered by the following problems? - Little interest or pleasure in doing things
1 2 3 4 5 6 7 <NA>
6413 13900 8172 3943 1230 1948 4378 29424
Class: numeric
$phq_down
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling down, depressed or hopeless
1 2 3 4 5 6 7 <NA>
8727 14216 6675 3362 1070 2068 3866 29424
Class: numeric
$gad_anxious
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling nervous, anxious or on edge
1 2 3 4 5 6 7 <NA>
5755 13322 7569 4542 1275 2303 5218 29424
Class: numeric
$gad_worry
Over the last 2 weeks, how often have you been bothered by the following problems? - Not being able to stop or control worrying
1 2 3 4 5 6 7 <NA>
11900 11965 5229 3206 1240 2142 4302 29424
Class: numeric
# Function to recode PHQ-4 items.
# Maps the 7-point anchors used in this survey onto the original 0-3
# PHQ-4 scoring (see the recoding table above this chunk):
#   1      -> 0  ("Not at all")
#   2 to 5 -> 1  ("Several days")
#   6      -> 2  ("More than half the days")
#   7      -> 3  ("Nearly every day")
# Any other value, including NA, stays NA.
recode_phq <- function(i) {
  recoded <- rep(NA_real_, length(i))
  recoded[which(i == 1)] <- 0
  recoded[which(i %in% 2:5)] <- 1
  recoded[which(i == 6)] <- 2
  recoded[which(i == 7)] <- 3
  recoded
}
# Sanity check: Count missing values in PHQ-4 items when gad_worry is not missing
df_merged |>
dplyr::filter(!is.na(gad_worry)) |>
dplyr::summarise(
dplyr::across(dplyr::all_of(phq4_items), ~ base::sum(is.na(.x))),
n_total = dplyr::n()
)# A tibble: 1 × 5
phq_interest phq_down gad_anxious gad_worry n_total
<int> <int> <int> <int> <int>
1 0 0 0 0 39984
# Apply recoding and compute sum scores
# only for participants who answered all PHQ-4 items
# (i.e., not missing in the last PHQ item)
# gad_worry was the last item in the PHQ-4 matrix
df_merged <- df_merged |>
dplyr::mutate(
# Calculate the sums for phq2, gad2, and phq4
# only for participants who answered all PHQ-4 items
# (raw 1-7 scale, before recoding)
phq2_sum = dplyr::if_else(
!is.na(gad_worry),
phq_down + phq_interest,
NA_real_
),
gad2_sum = dplyr::if_else(
!is.na(gad_worry),
gad_worry + gad_anxious,
NA_real_
),
# NA in either subscale propagates to the total automatically
phq4_sum = phq2_sum + gad2_sum
) |>
dplyr::mutate(
# Apply the recoding function to the individual PHQ items
# (creates *_rec columns on the standard 0-3 PHQ-4 scale)
dplyr::across(all_of(phq4_items), recode_phq, .names = "{.col}_rec"),
# Calculate the sums for recoded phq2, gad2, and phq4
# only for participants who answered all PHQ-4 items
phq2_sum_rec = dplyr::if_else(
!is.na(gad_worry),
phq_down_rec + phq_interest_rec,
NA_real_
),
gad2_sum_rec = dplyr::if_else(
!is.na(gad_worry),
gad_worry_rec + gad_anxious_rec,
NA_real_
),
phq4_sum_rec = phq2_sum_rec + gad2_sum_rec,
# Create a variable with cut-off labels
# (standard PHQ-4 severity bands on the 0-12 recoded total)
phq4_cat = dplyr::case_when(
!is.na(phq4_sum_rec) & phq4_sum_rec >= 0 & phq4_sum_rec <= 2 ~ "Normal (0–2)",
!is.na(phq4_sum_rec) & phq4_sum_rec >= 3 & phq4_sum_rec <= 5 ~ "Mild (3–5)",
!is.na(phq4_sum_rec) & phq4_sum_rec >= 6 & phq4_sum_rec <= 8 ~ "Moderate (6–8)",
!is.na(phq4_sum_rec) & phq4_sum_rec >= 9 & phq4_sum_rec <= 12 ~ "Severe (9–12)",
# I expect character values, so NA_character_
TRUE ~ NA_character_
),
# Create variables for depression and anxiety screening,
# using the standard cut-off of 3 on the respective subscales
# (1 = positive screen, 0 = negative, NA = subscale incomplete)
depression_screen = dplyr::case_when(
is.na(phq2_sum_rec) ~ NA_real_,
phq2_sum_rec >= 3 ~ 1,
TRUE ~ 0
),
anxiety_screen = dplyr::case_when(
is.na(gad2_sum_rec) ~ NA_real_,
gad2_sum_rec >= 3 ~ 1,
TRUE ~ 0
)
) |>
dplyr::relocate(phq2_sum:anxiety_screen, .after = gad_worry)
# Sanity checks (view the new variables)
dplyr::glimpse(
df_merged |>
dplyr::filter(!is.na(gad_worry)) |>
dplyr::select(phq_interest:anxiety_screen),
width = 100
)Rows: 39,984
Columns: 17
$ phq_interest <dbl> 2, 2, 3, 6, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 7, 6, 2, 1, 1, 2, 1, 2, 1, 3, …
$ phq_down <dbl> 3, 1, 3, 4, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 3, 6, 2, 1, 1, 2, 1, 2, 2, 1, …
$ gad_anxious <dbl> 2, 2, 3, 7, 2, 2, 1, 3, 2, 3, 2, 1, 1, 2, 3, 7, 2, 1, 1, 2, 1, 1, 1, 2, …
$ gad_worry <dbl> 1, 2, 3, 7, 1, 1, 1, 3, 2, 3, 2, 1, 2, 1, 1, 3, 2, 1, 1, 2, 1, 1, 2, 1, …
$ phq2_sum <dbl> 5, 3, 6, 10, 2, 3, 2, 4, 3, 4, 4, 2, 4, 3, 10, 12, 4, 2, 2, 4, 2, 4, 3, …
$ gad2_sum <dbl> 3, 4, 6, 14, 3, 3, 2, 6, 4, 6, 4, 2, 3, 3, 4, 10, 4, 2, 2, 4, 2, 2, 3, 3…
$ phq4_sum <dbl> 8, 7, 12, 24, 5, 6, 4, 10, 7, 10, 8, 4, 7, 6, 14, 22, 8, 4, 4, 8, 4, 6, …
$ phq_interest_rec <dbl> 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 3, 2, 1, 0, 0, 1, 0, 1, 0, 1, …
$ phq_down_rec <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 2, 1, 0, 0, 1, 0, 1, 1, 0, …
$ gad_anxious_rec <dbl> 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 3, 1, 0, 0, 1, 0, 0, 0, 1, …
$ gad_worry_rec <dbl> 0, 1, 1, 3, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, …
$ phq2_sum_rec <dbl> 2, 1, 2, 3, 0, 1, 0, 2, 1, 2, 2, 0, 2, 1, 4, 4, 2, 0, 0, 2, 0, 2, 1, 1, …
$ gad2_sum_rec <dbl> 1, 2, 2, 6, 1, 1, 0, 2, 2, 2, 2, 0, 1, 1, 1, 4, 2, 0, 0, 2, 0, 0, 1, 1, …
$ phq4_sum_rec <dbl> 3, 3, 4, 9, 1, 2, 0, 4, 3, 4, 4, 0, 3, 2, 5, 8, 4, 0, 0, 4, 0, 2, 2, 2, …
$ phq4_cat <chr> "Mild (3–5)", "Mild (3–5)", "Mild (3–5)", "Severe (9–12)", "Normal (0–2)…
$ depression_screen <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ anxiety_screen <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
# Sanity check: View the range of the recoded variables
base::table(df_merged$phq2_sum_rec, useNA = "always")
0 1 2 3 4 5 6 <NA>
4044 6610 21207 1987 2789 891 2456 29424
0 1 2 3 4 5 6 <NA>
4500 8337 18360 1861 2634 1130 3162 29424
0 1 2 3 4 5 6 7 8 9 10 11 12 <NA>
1958 2617 4413 6396 13785 1936 2304 1135 1600 794 1046 496 1504 29424
# Sanity check: Is there mismatch missing values between the two screenings?
dplyr::count(df_merged, depression_screen, anxiety_screen, name = "n")# A tibble: 5 × 3
depression_screen anxiety_screen n
<dbl> <dbl> <int>
1 0 0 28651
2 0 1 3210
3 1 0 2546
4 1 1 5577
5 NA NA 29424
Life Satisfaction
Income, Assets, and Debts
The open text field contained a Qualtrics validation that forced participants to answer only with digits [0-9], commas, and periods. However, a small number of participants managed to enter values that bypassed this validation (e.g., values including percentage signs, letters, or other characters).
For the countries where digits 0-9 are not the default numeric keypad, the translations included instructions requesting that participants use only digits 0-9 (Algeria, Bahrain, Chad, Egypt, Kuwait, Morocco, Oman, Saudi Arabia, UAE, Lebanon, Qatar).
Clean numbers
# Sanity check: View the counts of each option
# Option 10 is "Specify: [open text field]"
table_label(df_merged$income_orig)$income_orig
Please indicate what your total household income was for 2024 (before taxes). You can select an option or indicate a precise value. If you are retired or live off a pension, please indicate the amount your household received during the year in total payments. - Selected Choice
0 1 2 3 4 5 6 7 8 9 10 <NA>
2450 5532 6495 6707 6593 5832 5160 5042 3933 5890 6974 8800
Class: numeric
[1] "character"
[1] "character"
[1] "character"
# Participants were able to write their income, assets, and debts in an open text field.
head(unique(df_merged$income_text_orig), 20) [1] NA "8000" "15000" "7000" "243000" "124000" "12345678" "150000" "700.000" "10000" "50000" "400" "2500"
[14] "1500" "6000" "300000" "3600" "636000" "643" "435"
[1] "5" "2" "20.000" "1000000" "5000" "0,00" "250000" "1,000.00" "50000" "00" "0" "700000"
[13] "20000000" "600000" "70000000" "1" "100,000" "50000000" "7000000" "350000000"
[1] "10000000" "2" "18000" "0" "125000" "0,00" "1,000.00" "20.000,00" "1500" "200000000"
[11] "90000" "150000" "200000" "1300000" "10,000" "10000" "120000" "66000000" "100,000,000" "200"
# View values that end with "," or "."
df_merged |>
dplyr::filter(grepl("[.,]$", income_text_orig)) |>
dplyr::select(ResponseId, income_text_orig) |>
base::nrow();[1] 0
# A tibble: 18 × 2
ResponseId assets_orig
<chr> <chr>
1 R_2ilYHj1poprgCX8 1,00,000,
2 R_2Iaw1PAzIm22N4f 1,500,000.
3 R_7Xai7kgm6ni70up 1000000.
4 R_3Ezenl8l5Vbehqq 650000.
5 R_8EouiGcGN3SO3RO 200000.
6 R_7QEIOPC6sqjQ7jF 300000,
7 R_8NwCae5exBdtR98 600,000.
8 R_3wBLehUhjYbWTbq 100000.
9 R_9d3Tm1Wu2M6gFoh 0,
10 R_2E6m2ErMvEOb0wx 4000.
11 R_6elc9peo8vxbMVH 3000.
12 R_7rYZllIWkzJDJ2X 10.
13 R_9GHpnvrI5tXbopH 350000.
14 R_7sbQ258PDYZf45A 2,000000.
15 R_1Hc7FpY3tW9nsh7 500,000.
16 R_9dhgf8xvk6Ib8LX 100000.
17 R_8eOIl90Z2J6iB6k 5,000,000.
18 R_8CSIx79Mqkq1qaB 600,000,000,
# A tibble: 6 × 2
ResponseId debts_orig
<chr> <chr>
1 R_7QEIOPC6sqjQ7jF 23000,
2 R_1wuhfjwEOnWp9AS 0.
3 R_16SQZLnjugK3f6p 0.
4 R_5bW2dvfC8MaUgLB 5,00.
5 R_8CB0K2YQUfWxGY1 0.
6 R_5xPMFVkMda7RhuS 18000.
# Create function to clean numbers
# clean_number(): vectorised parser for free-text monetary amounts.
# Returns NA_real_ for anything it cannot confidently interpret as a
# non-negative amount (leading sign, percentages, "x", leading zeros,
# sub-unit decimals such as "0.7"). Ambiguous "," vs "." separators are
# resolved by the digit count after the last separator: >= 3 digits means
# thousands grouping, 1-2 digits means a decimal fraction.
clean_number <- function(i) {
  # Parse a single string; applied to each element via vapply below.
  parse_one <- function(s) {
    # Keep NA as NA
    if (is.na(s))
      return(NA_real_)
    # Remove leading/trailing spaces
    s <- stringr::str_trim(s)
    # first character must be a digit, otherwise NA
    # (rejects signed values like "+1000"/"-0" and leading separators)
    if (!stringr::str_detect(s, "^[0-9]"))
      return(NA_real_)
    # If contains "%" or "x", set to NA
    # (percentages and entries like "0x0"/"10x" are not amounts)
    if (stringr::str_detect(s, "%") || stringr::str_detect(s, "[xX]"))
      return(NA_real_)
    # Handle scientific notation ( if e/E is present)
    # Note: this branch accepts fractional results (e.g. "7e-1" -> 0.7)
    # even though plain decimals below 1 are rejected below.
    if (stringr::str_detect(s, "[eE]")) {
      s_sci <- s |>
        stringr::str_replace_all(",", ".") |>
        stringr::str_replace_all("[^0-9eE+\\-\\.]", "")
      val <- as.numeric(s_sci)
      return(val)
    }
    # Remove non-numeric characters (except "." and ",")
    s <- stringr::str_remove_all(s, "[^0-9,\\.]")
    # Allow "0"
    if (s == "0")
      return(0)
    # Place values of 0.0 / 0.00 / 0,0 / 0,00 / 0,000 as 0
    if (stringr::str_detect(s, "^0[\\.,]0{1,3}$"))
      return(0)
    # Otherwise, anything else starting with 0 and longer than 1 char -> NA
    # For example: "007", "01", "0.7", "0,7", "0.000", "0,000", "0002"
    if (stringr::str_detect(s, "^0") && base::nchar(s) > 1)
      return(NA_real_)
    # Remove "." or "," at the very end
    # For example: "1.000.000." -> "1.000.000"
    s <- stringr::str_replace(s,"[,\\.]$","")
    # Identify last occurrence of "," or "." as decimal separator
    # Some countries use "," as decimal separator and others use "."
    # m is a 1x3 matrix: m[1] full match, m[2] separator, m[3] trailing digits
    m <- stringr::str_match(s, "([,\\.])([0-9]*)$")
    if (!is.na(m[1])) {
      # Count the number of digits after the last separator
      sep <- m[2]
      digits_after <- m[3]
      len <- base::nchar(digits_after)
      if (len >= 3) {
        # Thousands separator, remove all separators
        # For example: "1.000.000" -> "1000000"
        s <- stringr::str_remove_all(s, "[,.]")
      } else {
        # Decimal, keep only last separator as decimal
        # Remove all other separators
        # For example: "1.000.000.00" -> "1000000.00"
        # "1,000,000,00" -> "1000000,00"
        s_wo_last <- stringr::str_sub(s, 1, nchar(s) - len - 1)
        s_wo_last <- stringr::str_remove_all(s_wo_last, "[,.]")
        # This R session uses "." as decimal separator,
        # so we need to convert accordingly
        # For example: "1000000,00" -> "1000000.00"
        s <- paste0(s_wo_last, ".", digits_after)
      }
    }
    # In R, numerical values have 53 bits of precision (9.0e15),
    # so very large numbers that exceed R's numeric limit will be rounded
    # to the nearest representable double.
    # For example, as.numeric("9999999999999999999") returns 10000000000000002048.
    as.numeric(s)
  }
  # vapply enforces a numeric(1) result per element; with USE.NAMES = TRUE
  # (the default) the input strings become the names of the output vector.
  vapply(i, parse_one, numeric(1))
}
# Sanity check:
clean_number(c(",1", "0.1", "0,75", "1%", "1000", "1000000,00", "1.000",
"1,00,000", "1.000.000.00", "1.000.000.", "0010", "10x", "7e-1",
"9999999999999999999", "0", "0.0", "0,0", "0.00", "0,00", "07",
"0.7", "0,7", "00", "00,00", "00.00")) ,1 0.1 0,75 1% 1000 1000000,00
NA NA NA NA 1000.0 1000000.0
1.000 1,00,000 1.000.000.00 1.000.000. 0010 10x
1000.0 100000.0 1000000.0 1000000.0 NA NA
7e-1 9999999999999999999 0 0.0 0,0 0.00
0.7 10000000000000002048.0 0.0 0.0 0.0 0.0
0,00 07 0.7 0,7 00 00,00
0.0 NA NA NA NA NA
00.00
NA
# Apply function to the values in open text fields
# (income "Specify" text, assets, debts), then place each cleaned numeric
# column directly after its original text column for easy comparison.
df_merged <- df_merged |>
dplyr::mutate(
income_text_clean = clean_number(income_text_orig),
assets_clean = clean_number(assets_orig),
debts_clean = clean_number(debts_orig)) |>
dplyr::relocate(income_text_clean, .after = income_text_orig) |>
dplyr::relocate(assets_clean, .after = assets_orig) |>
dplyr::relocate(debts_clean, .after = debts_orig)
# Sanity check: View changes between original and cleaned income text
df_merged |>
dplyr::mutate(
income_text_clean = as.character(income_text_clean),
n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
) |>
dplyr::filter(income_text_clean != income_text_orig) |>
dplyr::select(
ResponseId,
income_text_orig,
income_text_clean,
n_digits_orig,
n_digits_clean
) |>
print_reactable(sorted_col = "income_text_orig", width = 800)# Sanity check: View changes between original and cleaned assets text
df_merged |>
dplyr::mutate(
assets_clean = as.character(assets_clean),
debts_clean = as.character(debts_clean),
n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
) |>
dplyr::filter(assets_clean != assets_orig) |>
dplyr::select(
ResponseId,
assets_orig,
assets_clean,
n_digits_orig,
n_digits_clean
) |>
print_reactable(sorted_col = "assets_orig", width = 800)# Sanity check: View the new cleaned variables
df_merged |>
dplyr::select(
income_orig,
income_text_orig,
income_text_clean,
assets_orig,
assets_clean,
debts_orig,
debts_clean
) |>
dplyr::glimpse(width = 150)Rows: 69,408
Columns: 7
$ income_orig <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, 9, 7, 2, 3, 3, 1, 9, 10, 4, 4, 4, 8, 8, 9, 9, 10, 2, 3, 10, …
$ income_text_orig <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "8000", "15000", NA, NA, NA, NA, NA, NA, NA, NA, NA, "7000…
$ income_text_clean <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8000, 15000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7000, NA,…
$ assets_orig <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,000.00", "50000", "00", "0", "1000000", "0", "700000", "200000…
$ assets_clean <dbl> 5, 2, 20000, 1000000, 5000, 0, 250000, 1000, 50000, NA, 0, 1000000, 0, 700000, 20000000, 0, 0, 600000, 70000000, 1, 0, 0, …
$ debts_orig <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.00", "0", "20.000,00", "1500", "0", "0", "0", "200000000", "0…
$ debts_clean <dbl> 10000000, 2, 18000, 0, 125000, 0, 0, 1000, 0, 20000, 1500, 0, 0, 0, 200000000, 0, 90000, 0, 0, 0, 150000, 0, 200000, 0, 0,…
# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
n_income_orig_text = sum(!is.na(income_text_orig)),
n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
n_assets_orig = sum(!is.na(assets_orig)),
n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
n_debts_orig = sum(!is.na(debts_orig)),
n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)# A tibble: 1 × 6
n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
<int> <int> <int> <int> <int> <int>
1 6973 0 56550 861 56027 909
# Sanity check: View the new cleaned variables
# View values that contain non-numeric characters besides "." and ","
df_merged |>
dplyr::filter(!stringr::str_detect(income_text_orig, "^[0-9,\\.]+$") &
!is.na(income_orig)) |> select(income_text_orig, income_text_clean) |>
base::nrow();[1] 0
df_merged |>
dplyr::filter(!stringr::str_detect(assets_orig, "^[0-9,\\.]+$") &
!is.na(assets_orig)) |> select(assets_orig, assets_clean);# A tibble: 17 × 2
assets_orig assets_clean
<chr> <dbl>
1 -0 NA
2 +10000 NA
3 40% NA
4 4.5e7 45000000
5 1.78e10 17800000000
6 -0 NA
7 +80.000 NA
8 6.265e9 6265000000
9 -0 NA
10 0x0 NA
11 1.3425e10 13425000000
12 10 % NA
13 10% NA
14 2% NA
15 30% NA
16 0% NA
17 +1000000 NA
df_merged |>
dplyr::filter(!stringr::str_detect(debts_orig, "^[0-9,\\.]+$") &
!is.na(debts_orig)) |> select(debts_orig, debts_clean)# A tibble: 10 × 2
debts_orig debts_clean
<chr> <dbl>
1 7% NA
2 50% NA
3 8.95e8 895000000
4 5% NA
5 , 60000 NA
6 10% NA
7 10% NA
8 20% NA
9 ,0% NA
10 +4000 NA
Add financial country-level values
fin_values <-
readr::read_csv("111_country_variables.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 126
Columns: 23
$ country <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "…
$ language <chr> "Albanian", "Arabic", "Portuguese", "Spanish", "Armenian"…
$ UserLanguage <chr> "SQI-ALB", "AR-DZA", "PT-AGO", "ES-ARG", "AM-ARM", "EN-AU…
$ income_period <chr> "monthly", "monthly", "monthly", "monthly", "monthly", "a…
$ income_type <chr> "gross", "gross", "gross", "gross", "gross", "gross", "ne…
$ income_year <dbl> 2024, 2025, 2024, 2024, 2024, 2025, 2024, 2024, 2024, 202…
$ income_currency <chr> "lek", "د.ج", "Kz", "$ (peso)", "ՀՀ դրամ", "AU$", "€", "د…
$ income_currency_position <chr> "left", "right", "left", "right", "left", "right", "left"…
$ income_cutoff_min <dbl> 12000, 10000, 50000, 250000, 15000, 40000, 14508, 200, 20…
$ assets_cutoff_min <dbl> 1000, 10000, 0, 100, 10000, 500, 0, 1000, 1000, 100, 100,…
$ debts_cutoff_min <dbl> 1000, 1000, 10000, 100, 10000, 0, 0, 100, 100, 10, 100, 1…
$ assets_upper_limit <dbl> 40000001, 100000000, NA, 350000000, 50000000, 30000000, 2…
$ debts_upper_limit <dbl> 50000001, 300000000, NA, 350000000, 50000000, 3000000, 10…
$ wages_per_year <dbl> 12, 12, 13, 13, 12, NA, NA, NA, 12, NA, 13, 13, 13, 12, N…
$ inflation2024_factor <dbl> NA, 1.0010, NA, NA, NA, 1.0182, NA, NA, NA, NA, NA, NA, N…
$ one_local_unit_to_USD_conversion <dbl> 0.010738447, 0.007728573, 0.001149628, 0.001093261, 0.002…
$ one_USD_to_local_unit_conversion <dbl> 93.123, 129.390, 869.846, 914.695, 392.730, 1.531, 0.924,…
$ country_region <chr> "Europe & Central Asia", "Middle East, North Africa, Afgh…
$ continent <chr> "Europe", "MENA", "Africa", "South America", "Europe", "O…
$ country_incomegroup <chr> "Upper middle income", "Upper middle income", "Lower midd…
$ soft_launch <chr> "June 2", "June 7", "June 2", "June 2", "June 2", "June 5…
$ target_size <dbl> 300, 600, 600, 600, 300, 600, 300, 300, 300, 1200, 600, 6…
$ comment_country <chr> NA, "Collaborator said that the household income values a…
[1] 69408
# Join country-level financial values onto each response via UserLanguage.
# relationship = "many-to-one" (dplyr >= 1.1) asserts that each UserLanguage
# matches at most one row of fin_values, so the join can never duplicate
# responses; the nrow() sanity check below would otherwise only catch this
# after the fact.
df_merged <- df_merged |>
  dplyr::left_join(
    dplyr::select(fin_values, -country),
    by = "UserLanguage",
    relationship = "many-to-one"
  )
# Sanity check: Number of rows should remain the same
nrow(df_merged)[1] 69408
Create categorical variables
# Add categorical variable
# Label the income option chosen by each participant. The 10-level variant
# is derived from the 11-level one instead of duplicating the whole
# case_when, so the two mappings can never drift apart.
df_merged <- df_merged |>
  dplyr::mutate(
    # Considers all options, including option 10 ("Specify: [open text]")
    income_orig_cat_11 =
      dplyr::case_when(
        income_orig == 0 ~ "No income",
        income_orig == 1 ~ "Second decile",
        income_orig == 2 ~ "Third decile",
        income_orig == 3 ~ "Fourth decile",
        income_orig == 4 ~ "Fifth decile",
        income_orig == 5 ~ "Sixth decile",
        income_orig == 6 ~ "Seventh decile",
        income_orig == 7 ~ "Eighth decile",
        income_orig == 8 ~ "Ninth decile",
        income_orig == 9 ~ "Tenth decile",
        income_orig == 10 ~ "Specify",
        TRUE ~ NA_character_
      ),
    # Same mapping, but "Specify" (option 10) is set to NA;
    # NA inputs stay NA via if_else's NA propagation.
    income_orig_cat_10 = dplyr::if_else(
      income_orig_cat_11 == "Specify",
      NA_character_,
      income_orig_cat_11
    )
  ) |>
  dplyr::relocate(income_orig_cat_11, income_orig_cat_10, .after = income_orig)
# Sanity check: View the mapping distribution of the new income variables
df_merged |> dplyr::count(income_orig, income_orig_cat_11)# A tibble: 12 × 3
income_orig income_orig_cat_11 n
<dbl> <chr> <int>
1 0 No income 2450
2 1 Second decile 5532
3 2 Third decile 6495
4 3 Fourth decile 6707
5 4 Fifth decile 6593
6 5 Sixth decile 5832
7 6 Seventh decile 5160
8 7 Eighth decile 5042
9 8 Ninth decile 3933
10 9 Tenth decile 5890
11 10 Specify 6974
12 NA <NA> 8800
# A tibble: 12 × 3
income_orig income_orig_cat_10 n
<dbl> <chr> <int>
1 0 No income 2450
2 1 Second decile 5532
3 2 Third decile 6495
4 3 Fourth decile 6707
5 4 Fifth decile 6593
6 5 Sixth decile 5832
7 6 Seventh decile 5160
8 7 Eighth decile 5042
9 8 Ninth decile 3933
10 9 Tenth decile 5890
11 10 <NA> 6974
12 NA <NA> 8800
Add income bracket information
Country-specific adjustments were applied to ensure an accurate mapping. For example, due to the phrasing, some countries had overlapping values in the brackets: the last bracket could read “more than 4500” while 4500 was also the low point of the previous bracket.
# Load the income bracket information and apply country-specific adjustments.
# Each fix below aligns a bracket bound with the wording of the questionnaire
# in that survey language (see the comments for the specific discrepancies).
income_recoded <- base::readRDS("111_income_recoded.rds") |>
  dplyr::mutate(
    income_lowpoint =
      dplyr::case_when(
        # Correct Mongolia's income bracket typo: the questionnaire reads
        # "₮1,700,001 – ₮2,000,00" where it should read
        # "₮1,700,001 – ₮2,000,000"; fix the bounds of bracket 7 accordingly.
        UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 1700001,
        # Qatar 5th bracket: AR-QAT: [150000-250000]; EN-QAT: [150001-250000].
        # Align the Arabic version with the English one.
        UserLanguage == "AR-QAT" & income_orig == 5 ~ 150001,
        # Correct the third bracket in Morocco because it reads "Around 2,500
        # dirhams per month" in the middle of the deciles.
        UserLanguage == "AR-MAR" & income_orig == 2 ~ 1500,
        TRUE ~ income_lowpoint
      ),
    income_highpoint =
      dplyr::case_when(
        # Mongolia: see the matching lowpoint fix above.
        UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 2000000,
        # Correct the third bracket in Morocco because it reads "Around 2,500
        # dirhams per month" in the middle of the deciles.
        UserLanguage == "AR-MAR" & income_orig == 2 ~ 2500,
        # Correct Uzbekistan's income brackets so the highpoint of each decile
        # matches the lowpoint of the next decile (e.g., coding 14.9 mln as
        # 14999999).
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 3 ~ 4999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 4 ~ 9999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 5 ~ 14999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 6 ~ 19999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 7 ~ 24999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 8 ~ 29999999,
        # Georgia 2nd bracket: KA-GEO 0-500; EN-GEO 0-550.
        UserLanguage == "KA-GEO" & income_orig == 1 ~ 550,
        # Kyrgyzstan 8th bracket: KY-KGZ [100000-119000]; RU-KGZ [100000-119999].
        UserLanguage == "KY-KGZ" & income_orig == 8 ~ 119999,
        # AR-TCD 3rd bracket overlaps with the 2nd bracket and does not match
        # FR-TCD's 3rd bracket.
        UserLanguage == "AR-TCD" & income_orig == 3 ~ 3000000,
        TRUE ~ income_highpoint
      ),
  )
# Sanity check: Do any countries have different brackets across languages?
# We expect to only have differences between Ireland's sponsored and main versions.
income_recoded |>
dplyr::group_by(country, income_orig) |>
dplyr::summarise(
n_lang = dplyr::n_distinct(UserLanguage),
n_brackets = dplyr::n_distinct(
paste(income_lowpoint, income_highpoint)
),
bracket_defs = paste0(
UserLanguage, ": [", income_lowpoint, "-", income_highpoint, "]",
collapse = "; "
),
.groups = "drop"
) |> dplyr::filter(n_lang > 1, n_brackets > 1)# A tibble: 9 × 5
country income_orig n_lang n_brackets bracket_defs
<chr> <int> <int> <int> <chr>
1 Ireland 1 2 2 EN-IRL: [0-17500]; EN-IRL-sponsored: [0-22000]
2 Ireland 2 2 2 EN-IRL: [17500-24999]; EN-IRL-sponsored: [22001-32000]
3 Ireland 3 2 2 EN-IRL: [25000-34999]; EN-IRL-sponsored: [32001-42000]
4 Ireland 4 2 2 EN-IRL: [35000-49999]; EN-IRL-sponsored: [42001-55000]
5 Ireland 5 2 2 EN-IRL: [50000-74999]; EN-IRL-sponsored: [55001-67000]
6 Ireland 6 2 2 EN-IRL: [75000-99999]; EN-IRL-sponsored: [67001-85000]
7 Ireland 7 2 2 EN-IRL: [100000-149999]; EN-IRL-sponsored: [85001-105000]
8 Ireland 8 2 2 EN-IRL: [150000-200000]; EN-IRL-sponsored: [105001-137000]
9 Ireland 9 2 2 EN-IRL: [200000-NA]; EN-IRL-sponsored: [137000-NA]
# Correct gaps between brackets: within each survey language, force each
# interior bracket's low point to continue directly from the previous
# bracket's high point, then force each high point to end exactly one below
# the (already adjusted) low point of the next bracket. Only interior
# brackets (income_orig 2-8) are touched; the first and last brackets are
# open-ended and left as is.
income_gaps <- income_recoded |>
  dplyr::group_by(UserLanguage) |>
  dplyr::arrange(income_orig, .by_group = TRUE) |>
  # First check lowpoints
  dplyr::mutate(
    prev_high = dplyr::lag(income_highpoint),
    expected_low = prev_high + 1,
    # A gap exists when the observed low point is not one above the
    # previous bracket's high point.
    has_gap = income_orig >= 2 &
      income_orig <= 8 &
      !is.na(prev_high) &
      !is.na(income_lowpoint) &
      income_lowpoint != expected_low,
    income_lowpoint_adj = dplyr::if_else(
      has_gap,
      expected_low,
      income_lowpoint
    ),
    # Then highpoints, computed against the adjusted low points so the
    # corrected brackets tile without gaps or overlaps.
    next_low = dplyr::lead(income_lowpoint_adj),
    expected_high = next_low - 1L,
    high_needs_fix =
      income_orig >= 2 &
      income_orig <= 8 &
      !is.na(next_low) &
      !is.na(income_highpoint) &
      income_highpoint != expected_high,
    income_highpoint_adj = dplyr::if_else(
      high_needs_fix,
      expected_high,
      income_highpoint
    )
  ) |>
  dplyr::ungroup()
# Sanity check: View languages where there is a gap
income_gaps |>
dplyr::filter(has_gap) |>
dplyr::select(
UserLanguage,
income_orig,
prev_high,
income_lowpoint,
expected_low
) |>
print_reactable(sorted_col = "UserLanguage", width = 800)# Transform income_recoded into a wider format for merging
income_info <- income_gaps |>
dplyr::select(UserLanguage, income_orig,
income_lowpoint, income_lowpoint_adj,
income_highpoint, income_highpoint_adj) |>
tidyr::pivot_longer(
cols = c(income_lowpoint, income_lowpoint_adj,
income_highpoint, income_highpoint_adj),
names_to = "bound",
values_to = "value"
) |>
tidyr::pivot_wider(
names_from = c(bound, income_orig),
values_from = value,
names_sep = "_"
) |> dplyr::glimpse(width = 100)Rows: 125
Columns: 37
$ UserLanguage <chr> "AM-ARM", "AM-ETH", "AR-ARE", "AR-BHR", "AR-DZA", "AR-EGY", "AR-KWT…
$ income_lowpoint_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_lowpoint_adj_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_highpoint_1 <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_highpoint_adj_1 <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_lowpoint_2 <dbl> 24001, 601, 60000, 301, 15000, 70001, 500, 18000000, 1500, 500, 500…
$ income_lowpoint_adj_2 <dbl> 24001, 601, 60001, 301, 15001, 70001, 501, 18000001, 1501, 500, 500…
$ income_highpoint_2 <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_highpoint_adj_2 <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_lowpoint_3 <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000000, 2500, 1000…
$ income_lowpoint_adj_3 <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000001, 2501, 1000…
$ income_highpoint_3 <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_highpoint_adj_3 <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_lowpoint_4 <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000000, 4000, 150…
$ income_lowpoint_adj_4 <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000001, 4001, 150…
$ income_highpoint_4 <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_highpoint_adj_4 <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_lowpoint_5 <dbl> 192000, 2401, 240000, 1201, 50000, 600001, 2000, 90000000, 6000, 20…
$ income_lowpoint_adj_5 <dbl> 192001, 2401, 240000, 1201, 50000, 600001, 2000, 90000001, 6001, 20…
$ income_highpoint_5 <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_highpoint_adj_5 <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_lowpoint_6 <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000000, 8000, 2…
$ income_lowpoint_adj_6 <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000001, 8001, 2…
$ income_highpoint_6 <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_highpoint_adj_6 <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_lowpoint_7 <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000000, 10000…
$ income_lowpoint_adj_7 <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000001, 10001…
$ income_highpoint_7 <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_highpoint_adj_7 <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_lowpoint_8 <dbl> 960000, 10001, 500000, 2001, 150000, 2400001, 5000, 200000000, 1250…
$ income_lowpoint_adj_8 <dbl> 960001, 10001, 500000, 2001, 150000, 2400001, 5000, 200000001, 1250…
$ income_highpoint_8 <dbl> 1200000, 20000, 699999, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_adj_8 <dbl> 1200000, 19999, 699999, 2299, 199999, 4799999, 5999, 299999999, 149…
$ income_lowpoint_9 <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_lowpoint_adj_9 <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ income_highpoint_adj_9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
[1] 69408
# Merge the per-language income bracket bounds into the main data frame and
# position the new bound columns right after the income category variables.
df_merged <- dplyr::left_join(df_merged, income_info, by = "UserLanguage")
df_merged <- dplyr::relocate(
  df_merged,
  income_lowpoint_1:income_highpoint_adj_9,
  .after = income_orig_cat_10
)
# Sanity check
nrow(df_merged)[1] 69408
Identify strange numbers in income, assets, and debts
# Create function to identify strange numbers.
# Flag suspicious ("weird") numeric answers that look like keyboard patterns.
# Takes a numeric (or character) vector and returns a logical vector of the
# same length; missing inputs are treated as not weird (FALSE).
weird_nr <- function(i) {
  # Match digit patterns on the character representation of the values.
  s <- as.character(i)
  # Same non-zero digit repeated four (or more) times in a row
  # (e.g., 1111, 9999); runs of zeros are deliberately excluded.
  quad_repeat <- grepl("(?:1111|2222|3333|4444|5555|6666|7777|8888|9999)",
                       s, perl = TRUE)
  # Ascending or descending runs of at least three consecutive digits
  # (e.g., 123, 1234, 4321).
  run_up <- grepl("(?:123|234|345|456|567|678|789)", s, perl = TRUE)
  run_down <- grepl("(?:321|432|543|654|765|876|987)", s, perl = TRUE)
  # A two-digit block immediately repeated (e.g., 3939, 1212, 4545),
  # excluding the "00" block.
  block_repeat <- grepl("(?!0{2})(\\d{2})\\1+", s, perl = TRUE)
  flagged <- quad_repeat | run_up | run_down | block_repeat
  # Missing values are not weird.
  flagged[is.na(flagged)] <- FALSE
  flagged
}
# Sanity check:
weird_nr(c(999999, 12340, 43210, 3939, 540000, 75000, NA))[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# Apply function to financial variables.
# A value is flagged when it matches a suspicious digit pattern, or when the
# original free-text answer exists but could not be cleaned into a number
# (cleaning produced NA).
df_merged <- df_merged |>
  dplyr::mutate(
    income_wrd = weird_nr(income_text_clean) |
      # Also detect rows where original text exists but cleaning is NA
      (!is.na(income_text_orig) & is.na(income_text_clean)),
    assets_wrd = weird_nr(assets_clean) |
      (!is.na(assets_orig) & is.na(assets_clean)),
    debts_wrd = weird_nr(debts_clean) |
      (!is.na(debts_orig) & is.na(debts_clean))
  ) |>
  # Namespace-qualified relocate() for consistency with the rest of the script.
  dplyr::relocate(income_wrd, .after = income_text_clean) |>
  dplyr::relocate(assets_wrd, .after = assets_clean) |>
  dplyr::relocate(debts_wrd, .after = debts_clean)
# Sanity check: View the counts of weird numbers per variable
base::table(df_merged$income_wrd, useNA = "always")
FALSE TRUE <NA>
69353 55 0
FALSE TRUE <NA>
68386 1022 0
FALSE TRUE <NA>
68411 997 0
# Sanity check: View changes between original and cleaned income text
df_merged |>
dplyr::mutate(
income_text_clean = as.character(income_text_clean),
n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
) |>
dplyr::filter(
income_text_clean != income_text_orig |
(!is.na(income_text_orig) & is.na(income_text_clean))) |>
dplyr::group_by(
income_text_orig,
income_text_clean,
income_wrd,
n_digits_orig,
n_digits_clean
) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "income_text_orig", width = 800)# Sanity check: View changes between original and cleaned assets text
df_merged |>
dplyr::mutate(
assets_clean = as.character(assets_clean),
debts_clean = as.character(debts_clean),
n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
) |>
dplyr::filter(
assets_clean != assets_orig | (!is.na(assets_orig) & is.na(assets_clean))) |>
dplyr::group_by(
assets_orig,
assets_clean,
assets_wrd,
n_digits_orig,
n_digits_clean
) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "assets_orig", width = 800)# Sanity check: View changes between original and cleaned debts text
df_merged |>
dplyr::mutate(
debts_clean = as.character(debts_clean),
n_digits_orig = stringr::str_count(debts_orig, "[0-9]"),
n_digits_clean = stringr::str_count(debts_clean, "[0-9]")
) |>
dplyr::filter(
debts_clean != debts_orig | (!is.na(debts_orig) & is.na(debts_clean)
)) |>
dplyr::group_by(
debts_orig,
debts_clean,
debts_wrd,
n_digits_orig,
n_digits_clean
) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "debts_orig", width = 500)# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
n_income_orig_text = sum(!is.na(income_text_orig)),
n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
n_assets_orig = sum(!is.na(assets_orig)),
n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
n_debts_orig = sum(!is.na(debts_orig)),
n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)# A tibble: 1 × 6
n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
<int> <int> <int> <int> <int> <int>
1 6973 0 56550 861 56027 909
# Sanity check: View the rows with NA in cleaned values
# but original text exists
df_merged |> dplyr::group_by(income_text_orig, income_text_clean, income_wrd) |>
dplyr::filter(!is.na(income_text_orig) & is.na(income_text_clean)) |>
dplyr::summarise(n = dplyr::n()) |> base::nrow()[1] 0
df_merged |> dplyr::group_by(assets_orig, assets_clean, assets_wrd) |>
dplyr::filter(!is.na(assets_orig) & is.na(assets_clean)) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "assets_orig", width = 500)df_merged |> dplyr::group_by(debts_orig, debts_clean, debts_wrd) |>
dplyr::filter(!is.na(debts_orig) & is.na(debts_clean)) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "debts_orig", width = 500)Household Size
$household_size
How many people in your household are covered by these finances? Put 1 if you live alone, or if you live with others (e.g., roommates) but are financially independent from them and vice-versa, put 1. Otherwise, list the total number of people living with you that are part of household finances (both incomes and expenses).
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 <NA>
15591 15309 10311 9886 4984 2109 948 596 243 312 71 73 31 29 48 21 8 16 17 115 8690
Class: numeric
# Collapse household size into analysis groups; sizes of 6 or more are
# pooled into a single top category because of sparse upper-tail counts.
df_merged <- df_merged |>
  dplyr::mutate(
    household_size_group = factor(
      dplyr::case_when(
        # Sizes 1-3 keep their own label ("1", "2", "3").
        household_size %in% c(1, 2, 3) ~ as.character(household_size),
        household_size %in% c(4, 5) ~ "4-5",
        household_size >= 6 ~ "6-20",
        TRUE ~ NA_character_
      ),
      levels = c("1", "2", "3", "4-5", "6-20"),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(household_size_group, .after = household_size)
# Sanity check: View the mapping distribution of the new household size variable
base::table(df_merged$household_size, df_merged$household_size_group, useNA = "always")
1 2 3 4-5 6-20 <NA>
1 15591 0 0 0 0 0
2 0 15309 0 0 0 0
3 0 0 10311 0 0 0
4 0 0 0 9886 0 0
5 0 0 0 4984 0 0
6 0 0 0 0 2109 0
7 0 0 0 0 948 0
8 0 0 0 0 596 0
9 0 0 0 0 243 0
10 0 0 0 0 312 0
11 0 0 0 0 71 0
12 0 0 0 0 73 0
13 0 0 0 0 31 0
14 0 0 0 0 29 0
15 0 0 0 0 48 0
16 0 0 0 0 21 0
17 0 0 0 0 8 0
18 0 0 0 0 16 0
19 0 0 0 0 17 0
20 0 0 0 0 115 0
<NA> 0 0 0 0 0 8690
Birth Year and Age
Participants were able to write in an open text field their birth year, and the validation required values between 1925 and 2007, except for Iran (FA-IRN), where the validation ranged from 1304 to 1386.
[1] "character"
[1] "numeric"
# Sanity check: View values with non-numeric characters
df_merged |>
dplyr::filter(!is.na(birth_year_orig) & grepl("\\D", birth_year_orig)) |>
dplyr::select(birth_year_orig) |>
dplyr::distinct() |>
base::print(n = Inf)# A tibble: 41 × 1
birth_year_orig
<chr>
1 2001.
2 1993.
3 2000.
4 2003.
5 2002.
6 1995.
7 2005.
8 1997.
9 1969.
10 1977.
11 2005,
12 1985.
13 1982.
14 1983.
15 2004.
16 1996.
17 1980.
18 1994.
19 2007.
20 1992.
21 2006.
22 1975.
23 1972.
24 1978.
25 1999.
26 1990.
27 1989.
28 1945.
29 1981.
30 1971.
31 1974.
32 1955.
33 1949.
34 1959.
35 1988.
36 ,1979
37 1982.0424
38 ,1953
39 1973.01
40 1963.
41 1951.
# Create cleaned column and keep original.
# Calculate age and age groups.
df_merged <- df_merged |>
  dplyr::mutate(
    # Extract the first 4-digit sequence (drops stray punctuation such as
    # "2005," or "1982.0424") and convert to numeric.
    birth_year_clean =
      as.numeric(stringr::str_extract(birth_year_orig, "\\d{4}")),
    age = dplyr::case_when(
      # Keep the pre-computed ages of the Irish sponsored dataset
      # participants (they have no birth year on file).
      !is.na(age) ~ age,
      # If rows in birth year contain NA and no pre-computed age, keep NA.
      is.na(birth_year_clean) & is.na(age) ~ NA_real_,
      # Iran (UserLanguage "FA-IRN") reports birth years in the Solar
      # Hijri calendar; 1404 is the survey year in that calendar.
      UserLanguage == "FA-IRN" & !is.na(birth_year_clean) ~ 1404 - birth_year_clean,
      # Otherwise, use the Gregorian calendar (survey year 2025).
      !is.na(birth_year_clean) ~ 2025 - birth_year_clean,
      TRUE ~ NA_real_
    ),
    # Create age groups (unordered factor with explicit levels).
    age_group = base::factor(dplyr::case_when(
      age >= 18 & age <= 25 ~ "18-25",
      age >= 26 & age <= 44 ~ "26-44",
      age >= 45 & age <= 64 ~ "45-64",
      age >= 65 & age <= 74 ~ "65-74",
      age >= 75 ~ "75+",
      TRUE ~ NA_character_
    ),
    levels = c(
      "18-25",
      "26-44",
      "45-64",
      "65-74",
      "75+"
    ))
  ) |>
  dplyr::relocate(birth_year_clean:age_group, .after = birth_year_orig)
# Sanity check: View the summary of the cleaned birth year
cat(
"Min: ",
min(df_merged$birth_year_clean, na.rm = TRUE),
"\nMax: ",
max(df_merged$birth_year_clean, na.rm = TRUE),
"\nNA count: ",
sum(is.na(df_merged$birth_year_clean)),
"\nClass: ",
class(df_merged$birth_year_clean)
)Min: 1328
Max: 2007
NA count: 10380
Class: numeric
# Sanity check: Are there rows where raw birth year exists but cleaning failed?
df_merged |>
dplyr::filter(!is.na(birth_year_orig) & is.na(birth_year_clean)) |>
base::nrow()[1] 0
# Sanity check: View the summary of the age variable
cat(
"Min: ",
min(df_merged$age, na.rm = TRUE),
"\nMax: ",
max(df_merged$age, na.rm = TRUE),
"\nNA count: ",
sum(is.na(df_merged$age)),
"\nClass: ",
class(df_merged$age)
)Min: 18
Max: 100
NA count: 9180
Class: numeric
# Sanity check: View the mapping distribution of the new age group variable
base::table(df_merged$age_group, useNA = "ifany")
18-25 26-44 45-64 65-74 75+ <NA>
13674 30944 13098 1988 524 9180
# Sanity check: Are there rows where raw value exists but age group is missing?
df_merged |>
dplyr::filter(!is.na(birth_year_orig) & is.na(age_group)) |>
base::nrow()[1] 0
# Sanity check: View the new birth year and age variables
dplyr::glimpse(df_merged |>
dplyr::select(birth_year_orig,
birth_year_clean,
age,
age_group),
width = 100)Rows: 69,408
Columns: 4
$ birth_year_orig <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975", "1995", "…
$ birth_year_clean <dbl> 1989, 1984, 1971, 1986, 1993, 2005, 1986, 1975, 1995, 1963, 1993, 1981, 2…
$ age <dbl> 36, 41, 54, 39, 32, 20, 39, 50, 30, 62, 32, 44, 24, 35, 31, 24, 31, 39, 5…
$ age_group <fct> 26-44, 26-44, 45-64, 26-44, 26-44, 18-25, 26-44, 45-64, 26-44, 45-64, 26-…
# Sanity check: View counts of the sponsored Irish dataset
df_merged |>
dplyr::filter(irl==1) |>
dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
dplyr::summarise(n = dplyr::n())# A tibble: 67 × 6
# Groups: UserLanguage, birth_year_orig, birth_year_clean, age [67]
UserLanguage birth_year_orig birth_year_clean age age_group n
<chr> <chr> <dbl> <dbl> <fct> <int>
1 EN-IRL-sponsored <NA> NA 18 18-25 7
2 EN-IRL-sponsored <NA> NA 19 18-25 4
3 EN-IRL-sponsored <NA> NA 20 18-25 12
4 EN-IRL-sponsored <NA> NA 21 18-25 9
5 EN-IRL-sponsored <NA> NA 22 18-25 5
6 EN-IRL-sponsored <NA> NA 23 18-25 9
7 EN-IRL-sponsored <NA> NA 24 18-25 10
8 EN-IRL-sponsored <NA> NA 25 18-25 17
9 EN-IRL-sponsored <NA> NA 26 26-44 11
10 EN-IRL-sponsored <NA> NA 27 26-44 16
# ℹ 57 more rows
# Sanity check: View counts of Iran dataset
df_merged |>
dplyr::filter(UserLanguage == "FA-IRN") |>
dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(-birth_year_clean) |>
print_reactable(sorted_col = "birth_year_clean", width = 800)# Sanity check: View counts of main dataset
df_merged |>
dplyr::filter(irl == 0 & UserLanguage != "FA-IRN") |>
dplyr::group_by(birth_year_orig, birth_year_clean, age, age_group) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(-birth_year_clean) |>
print_reactable(sorted_col = "birth_year_clean", width = 800)Sex
Upon collaborators’ request, the option “I prefer to use: [open text field]” was hidden from the survey versions in Kuwait (AR-KWT; EN-KWT), Egypt (AR-EGY; EN-EGY), Yemen (AR-YEM; EN-YEM), in Algeria (AR-DZA), in Saudi Arabia (AR-SAU), Chad (AR-TCD; FR-TCD), and Bahrain (AR-BHR; EN-BHR).
$sex_orig
Which best describes you? - Selected Choice
1 2 3 <NA>
23444 36194 549 9221
Class: numeric
# Load recoded values regarding sex because
# some participants wrote "Female" or "Male" in the open text field
sex_recoded <-
readr::read_csv("111_sex_open_answers_recoded.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 544
Columns: 2
$ ResponseId <chr> "R_42tedcZhWdJn9Sk", "R_9hnp095IY8LIkSX", "R_2gvEljyDLXs1Yjm", "R_516n1yU…
$ sex_text_recoded <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot deter…
Cannot determine Female Male Non-binary Other <NA>
137 12 17 344 34 0
# Merge the manually recoded open-text sex answers and derive the reviewed
# and binary sex variables from them.
df_merged <- df_merged |>
  dplyr::left_join(sex_recoded, by = "ResponseId") |>
  dplyr::mutate(
    # Reviewed numeric coding (1 = Male, 2 = Female, 3 = Other).
    # Open-text recodes take precedence over the original selection;
    # "Cannot determine" becomes missing.
    sex_reviewed = dplyr::case_when(
      sex_text_recoded == "Male" ~ 1,
      sex_text_recoded == "Female" ~ 2,
      sex_text_recoded %in% c("Other", "Non-binary") ~ 3,
      sex_text_recoded == "Cannot determine" ~ NA_real_,
      TRUE ~ sex_orig
    ),
    # Categorical factor with explicit levels; the numeric code indexes
    # directly into the label vector (NA codes stay NA).
    sex_reviewed_cat = factor(
      c("Male", "Female", "Other")[sex_reviewed],
      levels = c("Male", "Female", "Other")
    ),
    # Binary numeric: 1 = Male, 0 = Female, NA otherwise (2 - code flips
    # 1 -> 1 and 2 -> 0).
    sex_binary = dplyr::if_else(
      sex_reviewed %in% c(1, 2),
      2 - sex_reviewed,
      NA_real_
    ),
    # Binary factor.
    sex_binary_cat = factor(
      dplyr::if_else(
        sex_binary == 1,
        "Male",
        dplyr::if_else(sex_binary == 0, "Female", NA_character_)
      ),
      levels = c("Male", "Female")
    )
  ) |>
  dplyr::relocate(sex_text_recoded:sex_binary_cat, .after = sex_orig)
# Sanity check: Cross-tabs to inspect recoded text vs numeric reviewed code
df_merged |>
dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
dplyr::summarise(n = dplyr::n(), .groups = "drop")# A tibble: 4 × 5
sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat n
<dbl> <fct> <dbl> <fct> <int>
1 1 Male 1 Male 23461
2 2 Female 0 Female 36206
3 3 Other NA <NA> 383
4 NA <NA> NA <NA> 9358
# Sanity check: Cross-tabs to inspect original values vs numeric reviewed code
table(df_merged$sex_orig, df_merged$sex_reviewed, useNA = "always")
1 2 3 <NA>
1 23444 0 0 0
2 0 36194 0 0
3 17 12 383 137
<NA> 0 0 0 9221
# Sanity check: View the counts of each option
df_merged |>
dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
dplyr::summarise(n = dplyr::n())# A tibble: 4 × 5
# Groups: sex_reviewed, sex_reviewed_cat, sex_binary [4]
sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat n
<dbl> <fct> <dbl> <fct> <int>
1 1 Male 1 Male 23461
2 2 Female 0 Female 36206
3 3 Other NA <NA> 383
4 NA <NA> NA <NA> 9358
Education Level
The translated education categories of each country will be mapped to a common set of categories. Some countries had a different definition of secondary education, so the mapping will consider if the level is eligible for university entrance or not. The classification of the education levels in each country was agreed upon with the collaborators. The recoded education categories are:
- Less than secondary (not eligible for university entrance)
- Secondary (completed the equivalent to high school, and it is eligible for university entrance)
- Technical (not higher education)
- University (higher education up to a bachelor’s degree)
- Advanced (anything beyond a bachelor’s degree)
Note:
- The team from Ethiopia (AM-ETH and EN-ETH) requested to hide option 7 from their versions of the survey.
- The team from Peru requested to include an option for “Inclusive Education”. Since this applies across several levels, this option was recoded to NA.
$education_orig
Which is the highest level of education you have completed?
1 2 3 4 5 6 7 8 <NA>
328 1084 8515 8206 19982 13762 4459 3773 9299
Class: numeric
# Load the education categories for each country
edu_cat <-
readr::read_csv("111_education_recoded.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 1,024
Columns: 5
$ UserLanguage <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM"…
$ education_orig <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,…
$ education_cat <chr> "No primary education", "Primary (Elementary/Middle School)", "High …
$ education_recoded_cat <chr> "Less than secondary", "Less than secondary", "Secondary", "Technica…
$ education_recoded <dbl> 1, 1, 2, 3, 4, 4, 5, 5, 1, 1, 2, 3, 4, 5, NA, 5, 1, 1, 2, 3, 4, 5, 5…
# Sanity check: View if there are unexpected values in education_orig
base::table(edu_cat$education_orig, useNA = "always")
1 2 3 4 5 6 7 8 <NA>
128 128 128 128 128 128 128 128 0
# Sanity check: View if the categories match the expected values
# We expect three cells with missing values regarding
# Peru's inclusive education level, and level 7 was hidden for Ethiopia (AM-ETH
# and EN-ETH).
edu_cat |>
dplyr::group_by(education_recoded_cat, education_recoded) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(education_recoded)# A tibble: 6 × 3
# Groups: education_recoded_cat [6]
education_recoded_cat education_recoded n
<chr> <dbl> <int>
1 Less than secondary 1 275
2 Secondary 2 138
3 Technical 3 119
4 University 4 129
5 Advanced 5 360
6 <NA> NA 3
# A tibble: 3 × 5
UserLanguage education_orig education_cat education_recoded_cat education_recoded
<chr> <dbl> <chr> <chr> <dbl>
1 AM-ETH 7 <NA> <NA> NA
2 EN-ETH 7 <NA> <NA> NA
3 ES-PER 5 Inclusive education <NA> NA
# Add the education categories to the main data frame, joining on survey
# language and the original education code, then make the recoded category
# an ordered factor so levels sort from lowest to highest attainment.
df_merged <- df_merged |>
  dplyr::left_join(
    dplyr::select(
      edu_cat,
      UserLanguage,
      education_orig,
      education_cat,
      education_recoded_cat,
      education_recoded
    ),
    by = c("UserLanguage", "education_orig")
  ) |>
  dplyr::mutate(
    education_recoded_cat = base::factor(
      education_recoded_cat,
      levels = c(
        "Less than secondary",
        "Secondary",
        "Technical",
        "University",
        "Advanced"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(education_cat:education_recoded, .after = education_orig)
# Sanity check: Are there education values without a corresponding
# education_recoded and education_recoded_cat?
df_merged |>
group_by(
UserLanguage,
education_orig,
education_cat,
education_recoded_cat,
education_recoded
) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::filter(is.na(education_recoded) & !is.na(education_orig))# A tibble: 1 × 6
# Groups: UserLanguage, education_orig, education_cat, education_recoded_cat [1]
UserLanguage education_orig education_cat education_recoded_cat education_recoded n
<chr> <dbl> <chr> <ord> <dbl> <int>
1 ES-PER 5 Inclusive education <NA> NA 31
# Sanity check: Check Irish sponsored dataset
df_merged |>
filter(irl == 1) |>
group_by(
education_irl,
education_orig,
education_cat,
education_recoded_cat,
education_recoded) |> dplyr::summarise(n = dplyr::n())# A tibble: 8 × 6
# Groups: education_irl, education_orig, education_cat, education_recoded_cat [8]
education_irl education_orig education_cat education_recoded_cat education_recoded n
<chr> <dbl> <chr> <ord> <dbl> <int>
1 Degree 6 Degree University 4 355
2 Diploma 5 Diploma Technical 3 185
3 Doctorate 8 Doctorate Advanced 5 11
4 Junior (Inter) Certificate or Equivalent 2 Junior (Inter) Certificate or Equivalent Less than secondary 1 67
5 Leaving Certificate 3 Leaving Certificate Secondary 2 277
6 Less than Junior (Inter) Cert 1 Less than Junior (Inter) Cert Less than secondary 1 13
7 Master's 7 Master's Advanced 5 154
8 Technical or Vocational Certificate 4 Technical or Vocational Certificate Technical 3 138
Employment Status
Upon collaborators’ request, the option “Part-time student” was hidden from the versions KA-GEO and EN-GEO in Georgia and SR-SRB in Serbia. The option “Military service” was hidden from the version JA-JPN in Japan.
During the survey completion, participants were not allowed to select conflicting options:
- Employed full-time and part-time simultaneously.
- Student full-time and part-time simultaneously.
- Employed/working full-time or part-time and not in paid employment simultaneously.
- Military service and not in paid employment simultaneously.
- Military service and retired simultaneously.
- Retired and not in paid employment simultaneously.
- Not in paid employment by choice and looking for work or unable to work due to health/personal reasons simultaneously.
- Looking for work and unable to work due to health/personal reasons simultaneously.
Employment status was recoded using a sequential rule:
- Military, if the military service option was selected.
- Employed/working full-time (25+ hours per week), if the full-time employment option was selected.
- Employed/working part-time (less than 25 hours per week), if the part-time employment option was selected.
- Not in paid employment (looking for work), if the job-seeking option was selected and no conditions above were met.
- Student non-working (Full or part-time), if the full- or part-time student option was selected and no conditions above were met.
- Not in paid employment (by choice/health), if not working by choice or for health reasons and no conditions above were met.
- Retired, if the retired option was selected and no conditions above were met.
$employment_orig
Which most accurately describes you at this moment? You may select up to two options in case you fit more than one category.
1 1,3 1,4 1,5 1,6 1,7 1,8 1,9 2 2,3 2,4 2,5 2,6 2,7 2,8 2,9 3 3,5 3,6 4 4,5 4,6 5 6 7
6511 962 984 70 18 328 671 114 1312 1440 628 34 28 113 267 54 30950 217 189 4737 47 182 589 2592 1839
8 9 <NA>
3233 1728 9571
Class: character
# Replace numeric values with descriptive labels
# (Qualtrics multiple-choice codes for the employment question).
employment_labels <- c(
  "1" = "Full-time student",
  "2" = "Part-time student",
  "3" = "Employed/working full-time (25+ hours per week)",
  "4" = "Employed/working part-time (less than 25 hours per week)",
  "5" = "Military service",
  "6" = "Retired",
  "7" = "Not in paid employment (by choice)",
  "8" = "Not in paid employment (looking for work)",
  "9" = "Not in paid employment (unable to work due to health/personal reasons)")
# Recode comma-separated multiple-choice codes (e.g., "2,5") into a single
# "; "-separated label string ("Part-time student; Military service").
# Vectorized: accepts a character vector and returns one recoded string per
# element; NA inputs stay NA. Scalar calls behave exactly as before, so the
# existing sapply() caller is unaffected.
recode_employment <- function(i) {
  vapply(
    as.character(i),
    function(codes_str) {
      # Missing answers stay missing.
      if (is.na(codes_str)) {
        return(NA_character_)
      }
      # Split "2,5" into c("2", "5"), trim stray whitespace, then map each
      # code to its label and collapse into one string.
      codes <- trimws(strsplit(codes_str, ",", fixed = TRUE)[[1]])
      paste(employment_labels[codes], collapse = "; ")
    },
    character(1),
    USE.NAMES = FALSE
  )
}
# Derive the labelled multi-select variable and the single primary
# employment status. The case_when below implements the sequential rule
# from the text above via first-match-wins evaluation: each branch only
# needs to exclude codes that a LATER branch would otherwise claim;
# earlier branches have already captured their codes.
df_merged <- df_merged |>
  dplyr::mutate(
    # Apply recoding function to create employment_cat variable
    # so instead of "2,5", we have "Part-time student; Military service"
    employment_cat =
      stringr::str_squish(sapply(employment_orig, recode_employment)),
    employment_primary = base::factor(
      dplyr::case_when(
        # Contains option 5 (military service has highest precedence)
        stringr::str_detect(employment_orig, fixed("5"))
          ~ "Military service",
        # Contains option 3 AND does not contain option 5
        stringr::str_detect(employment_orig, fixed("3")) &
          !(stringr::str_detect(employment_orig, fixed("5")))
          ~ "Employed/working full-time (25+ hours per week)",
        # Contains option 4 AND does not contain option 5
        # (it was not possible to select options 3 and 4 simultaneously)
        stringr::str_detect(employment_orig, fixed("4")) &
          !(stringr::str_detect(employment_orig, fixed("5")))
          ~ "Employed/working part-time (less than 25 hours per week)",
        # Contains option 8; anything with 5, 3, or 4 was already captured
        # above, and the survey disallowed 8 with 5, 3, or 4 anyway
        stringr::str_detect(employment_orig, fixed("8"))
          ~ "Not in paid employment (looking for work)",
        # Contains option 1 or 2 AND does not contain option 5, 3, 4, or 8
        (stringr::str_detect(employment_orig, fixed("1")) |
          stringr::str_detect(employment_orig, fixed("2"))) &
          !(stringr::str_detect(employment_orig, fixed("5"))) &
          !(stringr::str_detect(employment_orig, fixed("3"))) &
          !(stringr::str_detect(employment_orig, fixed("4"))) &
          !(stringr::str_detect(employment_orig, fixed("8")))
          ~ "Student non-working (Full or part-time)",
        # Contains option 7 or 9 AND does not contain option 1 or 2
        # (it was not possible to select options 7 or 9
        # together with 8, 5, 3, or 4)
        (stringr::str_detect(employment_orig, fixed("7")) |
          stringr::str_detect(employment_orig, fixed("9"))) &
          !(stringr::str_detect(employment_orig, fixed("1"))) &
          !(stringr::str_detect(employment_orig, fixed("2")))
          ~ "Not in paid employment (by choice/health)",
        # Contains option 6 AND does not contain option 3, 4, 1, or 2;
        # combinations with 5, 7, 8, or 9 were either disallowed by the
        # survey or captured by an earlier branch
        stringr::str_detect(employment_orig, fixed("6")) &
          !(stringr::str_detect(employment_orig, fixed("3"))) &
          !(stringr::str_detect(employment_orig, fixed("4"))) &
          !(stringr::str_detect(employment_orig, fixed("1"))) &
          !(stringr::str_detect(employment_orig, fixed("2")))
          ~ "Retired",
        TRUE ~ NA_character_
      ),
      levels = c(
        "Not in paid employment (by choice/health)",
        "Not in paid employment (looking for work)",
        "Student non-working (Full or part-time)",
        "Employed/working full-time (25+ hours per week)",
        "Employed/working part-time (less than 25 hours per week)",
        "Retired",
        "Military service"
      )
    )
  ) |>
  dplyr::relocate(employment_cat:employment_primary, .after = employment_orig)
# Sanity check: How many options were selected per participant?
# Number of options = number of commas + 1; NA inputs are kept as NA
# (NA_integer_ so both if_else branches share the integer type).
df_merged |>
dplyr::mutate(number_of_options_selected =
if_else(is.na(employment_orig),
NA_integer_,
str_count(employment_orig, ",") + 1)) |>
count(number_of_options_selected)# A tibble: 3 × 2
number_of_options_selected n
<dbl> <int>
1 1 53491
2 2 6346
3 NA 9571
# Sanity check: View the distribution of primary employment
base::table(df_merged$employment_primary, useNA = "ifany")
Not in paid employment (by choice/health) Not in paid employment (looking for work)
3567 4171
Student non-working (Full or part-time) Employed/working full-time (25+ hours per week)
8478 33541
Employed/working part-time (less than 25 hours per week) Retired
6531 2592
Military service <NA>
957 9571
# Sanity check: Cross-tab between primary employment and original employment
print(table(df_merged$employment_primary,
df_merged$employment_orig, useNA = "ifany"), n = Inf)
1 1,3 1,4 1,5 1,6 1,7 1,8 1,9 2 2,3 2,4 2,5 2,6 2,7 2,8
Not in paid employment (by choice/health) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Not in paid employment (looking for work) 0 0 0 0 0 0 671 0 0 0 0 0 0 0 267
Student non-working (Full or part-time) 6511 0 0 0 18 328 0 114 1312 0 0 0 28 113 0
Employed/working full-time (25+ hours per week) 0 962 0 0 0 0 0 0 0 1440 0 0 0 0 0
Employed/working part-time (less than 25 hours per week) 0 0 984 0 0 0 0 0 0 0 628 0 0 0 0
Retired 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Military service 0 0 0 70 0 0 0 0 0 0 0 34 0 0 0
<NA> 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2,9 3 3,5 3,6 4 4,5 4,6 5 6 7 8 9 <NA>
Not in paid employment (by choice/health) 0 0 0 0 0 0 0 0 0 1839 0 1728 0
Not in paid employment (looking for work) 0 0 0 0 0 0 0 0 0 0 3233 0 0
Student non-working (Full or part-time) 54 0 0 0 0 0 0 0 0 0 0 0 0
Employed/working full-time (25+ hours per week) 0 30950 0 189 0 0 0 0 0 0 0 0 0
Employed/working part-time (less than 25 hours per week) 0 0 0 0 4737 0 182 0 0 0 0 0 0
Retired 0 0 0 0 0 0 0 0 2592 0 0 0 0
Military service 0 0 217 0 0 47 0 589 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0 0 0 0 0 9571
# Sanity check: View the counts of each option
df_merged |>
dplyr::group_by(employment_orig, employment_cat, employment_primary) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "employment_orig", width = 800)Citizenship and Ethnicity
This item allowed participants to select multiple choices. The first eight options referred to ethnicity. Only some countries contained the options related to ethnicity.
The last three options referred to citizenship status. Participants were not allowed to select Citizen of [country] and Resident of [country] (non-citizen) simultaneously. All countries contained the citizenship options.
Citizenship
$ethnicity_citizenship_orig
Please choose which best describes you. You must select at least one option from the top part and at least one option from the bottom. - Selected Choice
1,10 1,11 1,2,10 1,2,11 1,2,3,10 1,2,3,4,10 1,2,3,4,5,10
24538 444 321 6 11 4 1
1,2,3,4,5,6,10 1,2,3,4,5,6,7,8,11 1,2,3,4,6,7,10 1,2,3,4,7,10 1,2,3,4,7,8,9,10 1,2,3,5,10 1,2,3,5,8,10
1 1 1 1 1 2 1
1,2,3,5,9,10 1,2,3,8,10 1,2,3,9,10 1,2,4,10 1,2,4,11 1,2,4,5,10 1,2,4,6,7,10
1 2 1 115 3 10 1
1,2,4,6,8,10 1,2,4,8,10 1,2,4,8,11 1,2,4,9,10 1,2,5,10 1,2,6,10 1,2,6,11
1 11 1 5 5 2 1
1,2,6,8,10 1,2,7,10 1,2,7,8,10 1,2,8,10 1,2,8,11 1,2,9 1,2,9,10
1 1 1 19 2 14 16
1,2,9,11 1,3,10 1,3,11 1,3,4,10 1,3,4,5,10 1,3,4,6,10 1,3,5,10
1 164 2 8 1 1 3
1,3,5,9 1,3,6,10 1,3,8,10 1,3,8,11 1,3,8,9,10 1,3,8,9,11 1,3,9
1 1 11 1 2 1 2
1,3,9,10 1,4,10 1,4,11 1,4,5,10 1,4,5,11 1,4,5,8,10 1,4,5,9
5 134 5 14 1 3 1
1,4,5,9,10 1,4,6,10 1,4,6,8,10 1,4,7,10 1,4,7,8,10 1,4,8,10 1,4,8,9,10
1 1 1 3 1 9 2
1,4,9 1,4,9,10 1,4,9,11 1,5,10 1,5,11 1,5,6,8,10 1,5,6,9,10
3 8 1 104 4 1 1
1,5,7,10 1,5,8,10 1,5,9 1,5,9,10 1,5,9,11 1,6,10 1,6,11
2 10 4 3 2 72 1
1,6,7,10 1,6,8,10 1,6,8,9 1,6,9,10 1,6,9,11 1,7,10 1,7,11
1 5 1 3 1 46 1
1,7,8,10 1,7,8,9,10 1,7,9 1,7,9,10 1,7,9,11 1,8,10 1,8,11
1 2 2 2 1 584 30
1,8,9 1,8,9,10 1,8,9,11 1,9 1,9,10 1,9,11 10
28 17 2 664 368 95 10450
11 2,10 2,11 2,3,10 2,3,11 2,3,4,5,6,9,10 2,3,4,5,7,10
415 2307 222 14 3 1 1
2,3,4,5,9 2,3,4,8,11 2,3,4,9,11 2,3,5,11 2,3,5,9,11 2,3,6,8,10 2,3,7,10
1 1 1 3 1 1 2
2,3,8,10 2,3,8,9,11 2,3,9 2,3,9,11 2,4,10 2,4,11 2,4,5,6,10
2 1 3 2 80 8 1
2,4,5,6,7,10 2,4,5,7,10 2,4,6,10 2,4,7,10 2,4,8,10 2,4,9 2,4,9,10
1 1 2 1 2 2 2
2,5,10 2,5,11 2,5,7,10 2,5,9,11 2,6,10 2,6,7,10 2,6,8,10
12 3 1 4 10 1 1
2,6,9 2,6,9,11 2,7,10 2,7,8,9,10 2,7,9 2,7,9,10 2,8,10
2 1 40 1 2 6 83
2,8,11 2,8,9 2,8,9,10 2,8,9,11 2,9 2,9,10 2,9,11
4 32 3 5 306 46 60
3,10 3,11 3,4,10 3,4,5,7,10 3,4,6,11 3,4,7,10 3,4,7,11
1406 64 18 1 1 3 1
3,4,8,10 3,4,8,9 3,4,9 3,4,9,10 3,4,9,11 3,5,10 3,5,6,9,10
2 1 2 2 1 10 1
3,5,7,10 3,5,7,9 3,5,9 3,6,10 3,7,10 3,7,9 3,7,9,10
1 1 1 11 22 1 2
3,8,10 3,8,11 3,8,9 3,8,9,10 3,8,9,11 3,9 3,9,10
53 6 5 4 1 127 36
3,9,11 4,10 4,11 4,5,10 4,5,11 4,5,7,10 4,5,8,10
11 2701 97 30 1 7 3
4,5,9 4,6,10 4,6,11 4,6,7,10 4,6,8,10 4,6,9 4,7,10
2 25 2 1 2 1 135
4,7,11 4,7,8,10 4,7,8,9 4,7,9 4,7,9,10 4,7,9,11 4,8,10
2 2 1 6 9 2 100
4,8,11 4,8,9 4,8,9,10 4,9 4,9,10 4,9,11 5,10
4 7 2 179 56 28 2617
5,11 5,6,10 5,6,9 5,7,10 5,7,8,10 5,8,10 5,8,11
45 2 1 46 4 48 2
5,8,9 5,8,9,10 5,8,9,11 5,9 5,9,10 5,9,11 6,10
5 4 1 90 34 20 1233
6,11 6,7,10 6,7,9 6,7,9,10 6,8,10 6,8,11 6,8,9
18 10 1 1 40 1 1
6,8,9,11 6,9 6,9,10 6,9,11 7,10 7,11 7,8,10
1 33 14 14 4584 42 34
7,8,9,11 7,9 7,9,10 7,9,11 8,10 8,11 8,9
1 137 108 34 1715 101 98
8,9,10 8,9,11 9 9,10 9,11 <NA>
47 38 401 338 269 9825
Class: character
# Create variables for each citizenship option (9 to 11)
# Collapse the citizenship codes (9 = born outside country, 10 = citizen,
# 11 = non-citizen permanent resident) into one 5-level factor.
# The case_when() rules are evaluated in order; the first match wins.
# NOTE(review): stringr::fixed() does substring matching. It is safe here
# because, among codes 1-11, "10" and "11" never occur as substrings of any
# other comma-separated token; revisit if option codes ever change.
df_merged <- df_merged |>
dplyr::mutate(
citizenship_cat = base::factor(
dplyr::case_when(
# When option 10 = "Citizen of [country]" was selected
# and option 9 = "Born outside [country]" was not selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
~ "Citizen",
# When option 11 = "Resident of [country] (non-citizen)" was selected
# and option 9 = "Born outside [country]" was not selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11")) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
~ "Non-citizen (Permanent Resident)",
# When option 9 = "Born outside [country]" was selected
# and option 10 = "Citizen of [country]" was selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10"))
~ "Born outside country (Citizen)",
# When option 9 = "Born outside [country]" was selected
# and option 11 = "Resident of [country] (non-citizen)" was selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))
~ "Born outside country (Non-citizen, Permanent Resident)",
# When only option 9 = "Born outside [country]" was selected.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")))
~ "Born outside country (Non-citizen, Non-permanent Resident)",
# No citizenship code selected (or item skipped) -> NA
TRUE ~ NA_character_
),
levels = c(
"Citizen",
"Non-citizen (Permanent Resident)",
"Born outside country (Citizen)",
"Born outside country (Non-citizen, Permanent Resident)",
"Born outside country (Non-citizen, Non-permanent Resident)")
)
) |>
dplyr::relocate(citizenship_cat, .after = ethnicity_citizenship_orig)
# Sanity check: View the distribution of citizenship categories
df_merged |>
dplyr::mutate(
# Extract only the citizenship options selected
citizenship_extract = stringr::str_extract_all(
ethnicity_citizenship_orig, "(?<=^|,)(9|10|11)(?=,|$)") |>
purrr::map_chr(\(i) {
if (length(i) == 0) return(NA_character_)
if (all(is.na(i))) return(NA_character_)
paste(i[!is.na(i)], collapse = ",")
})
) |>
dplyr::group_by(citizenship_extract, citizenship_cat) |>
dplyr::summarise(n = dplyr::n())# A tibble: 6 × 3
# Groups: citizenship_extract [6]
citizenship_extract citizenship_cat n
<chr> <fct> <int>
1 10 Citizen 54110
2 11 Non-citizen (Permanent Resident) 1549
3 9 Born outside country (Non-citizen, Non-permanent Resident) 2169
4 9,10 Born outside country (Citizen) 1155
5 9,11 Born outside country (Non-citizen, Permanent Resident) 600
6 <NA> <NA> 9825
Ethnicity
# Upload the ethnicity categories translated that were used for each country
ethnicity_cat <-
readr::read_csv("111_ethnicity_labels_translated.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 533
Columns: 3
$ UserLanguage <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AR-ARE", "AR-ARE", "AR-AR…
$ option_number <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 4, 5, 6, 1…
$ label <chr> "Armenians", "Ezidis", "Russians", "Assyrians", "Ukrainians", "Arab/Middle E…
# Aggregate the selected ethnicity options (codes 1-7) into one
# comma-separated string per respondent.
# Option 8 = "Specify: [open text field]" is deliberately not extracted here;
# its recoded answers are merged in later from a separate file.
df_merged <- df_merged |>
  dplyr::mutate(
    ethnicity_agg = stringr::str_extract_all(
      ethnicity_citizenship_orig,
      "(?<=^|,)(1|2|3|4|5|6|7)(?=,|$)"
    ) |>
      purrr::map_chr(\(hits) {
        # Participants that did not complete this item, or completed a
        # survey version without ethnicity options, end up with NA.
        found <- hits[!is.na(hits)]
        if (length(found) == 0) {
          NA_character_
        } else {
          paste(found, collapse = ",")
        }
      })
  )
# Replace ethnicity options with the translated labels
# Transformation will be conducted in a temporary data frame for safety
df_temp <- df_merged |>
# Remove missing values for this transformation
dplyr::filter(!is.na(ethnicity_agg)) |>
# Separate values into rows
# (if participant wrote "1,2", create two rows: one with "1" and another with "2")
tidyr::separate_rows(ethnicity_agg, sep = ",") |>
# Create variable that is going to match with ethnicity_cat
dplyr::mutate(option_number = as.numeric(stringr::str_trim(ethnicity_agg))) |>
# Join ethnicity_cat to get the translated labels
# (matching on survey language AND option number, since the same option
# number maps to different labels in different country/language versions)
dplyr::left_join(ethnicity_cat, by = c("UserLanguage", "option_number")) |>
# Bring back to former format of having multiple options in a single row
# but now with the translated labels instead of numbers
dplyr::group_by(ResponseId) |>
dplyr::summarise(
ethnicity_translated = base::paste(label[!is.na(label)], collapse = ",")
)
# Join back to main data frame
# NOTE(review): the left_join() that merges df_temp$ethnicity_translated back
# into df_merged does not appear in this rendered listing, yet later code
# reads df_merged$ethnicity_translated — confirm against the source .qmd.
nrow(df_merged)[1] 69408
[1] 69408
# Cleanup
rm(df_temp)
# Sanity check: Are the number of missing values in the new variable the same
# as in the original variable plus those that only selected citizenship options
# or only the please specify option (8)?
# The regex matches strings composed SOLELY of codes 8, 9, 10, or 11
# (optionally whitespace-padded, comma-separated) — i.e. respondents who
# selected no translatable ethnicity option (1-7).
sum(is.na(df_merged$ethnicity_translated)) ==
(sum(is.na(df_merged$ethnicity_citizenship_orig)) + sum(
!is.na(df_merged$ethnicity_citizenship_orig) &
stringr::str_detect(
df_merged$ethnicity_citizenship_orig,
"^(?:\\s*(?:8|9|10|11)\\s*)(?:,\\s*(?:8|9|10|11)\\s*)*$"
)
))[1] TRUE
# Add the cleaned responses from the "Specify: [open text field]" option (8)
ethnicity_recoded <-
readr::read_csv("111_ethnicity_open_answers_recoded.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 3,274
Columns: 2
$ ResponseId <chr> "R_8FrYunIVSiVeX5B", "R_8rYZOG6u8qXwprj", "R_8p9yE9TFIjGUonc", "R_2Lzosf…
$ ethnicity_specify <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot dete…
df_merged <- df_merged |>
dplyr::left_join(ethnicity_recoded, by = "ResponseId") |>
dplyr::relocate(ethnicity_agg:ethnicity_specify, .after = ethnicity_citizenship_orig)
# Sanity check
dplyr::glimpse(df_merged |>
dplyr::group_by(ethnicity_citizenship_orig, UserLanguage) |>
dplyr::distinct(ethnicity_citizenship_orig, UserLanguage,
ethnicity_agg, ethnicity_translated, ethnicity_specify,
.keep_all = TRUE) |>
dplyr::ungroup() |>
dplyr::select(UserLanguage, ethnicity_citizenship_orig,
ethnicity_agg, ethnicity_translated, ethnicity_specify),
width = 100)Rows: 2,567
Columns: 5
$ UserLanguage <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "FIL-PHL", "P…
$ ethnicity_citizenship_orig <chr> "3,6,10", "1,10", "5,10", "3,10", "1,10", "1,10", "1,9", "2,3,5…
$ ethnicity_agg <chr> "3,6", "1", "5", "3", "1", "1", "1", "2,3,5", "2", "2", "1,2", …
$ ethnicity_translated <chr> "Diola / Malinké,Haalpulaaren", "Wolof / Lébou", "White", "Blac…
$ ethnicity_specify <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Mo…
# View values
df_merged |>
dplyr::filter(!is.na(ethnicity_citizenship_orig)) |>
dplyr::mutate(
ethnicity_extract = stringr::str_extract_all(
ethnicity_citizenship_orig,
"(?<=^|,)(1|2|3|4|5|6|7|8)(?=,|$)"
) |>
purrr::map_chr(\(i) {
if (length(i) == 0) return(NA_character_)
if (all(is.na(i))) return(NA_character_)
paste(i[!is.na(i)], collapse = ",")
})
) |>
dplyr::group_by(country, ethnicity_extract, ethnicity_agg,
ethnicity_translated, ethnicity_specify) |>
dplyr::summarise(n = dplyr::n(), .groups = "drop") |>
dplyr::arrange(country) |>
print_reactable(sorted_col = "country", width = 800)Honeypot for Bots
An item was added to the survey, and via JavaScript code, the item was hidden from human participants. Bots that do not execute JavaScript code would process this item and provide an answer, which would allow us to identify them.
Optional Ending
This item was optional and can be NA. Also, this item was not shown to sponsored participants.
Childhood Socioeconomic Status
$childhood_SES
As a child, how would you describe the financial situation in your household compared to a typical home where you grew up?
1 2 3 4 5 <NA>
4564 10353 15104 8711 1219 29457
Class: numeric
# Create categorical variable with labels.
# Values 1-5 map one-to-one onto the labels below; anything else
# (including NA) becomes NA, matching the previous recoding.
df_merged <- df_merged |>
  dplyr::mutate(
    childhood_SES_cat = base::factor(
      childhood_SES,
      levels = 1:5,
      labels = c(
        "Poor",
        "Below average but not poor",
        "Around average",
        "Above average but not wealthy",
        "Wealthy"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(childhood_SES_cat, .after = childhood_SES)
# Sanity check: View the distribution of the new variable
base::table(df_merged$childhood_SES_cat, useNA = "always")
Poor Below average but not poor Around average Above average but not wealthy Wealthy
4564 10353 15104 8711 1219
<NA>
29457
Financial Outlook and Confidence
# Nothing to do here, the item is already numeric and
# within the minimum and maximum values.
table_label(df_merged$fin_outlook)$fin_outlook
What is your expectation for how things will be for you financially one year from now?
1 2 3 4 5 <NA>
1663 3952 13487 14794 5958 29554
Class: numeric
$fin_outlook_conf
On a scale from 1 (completely uncertain) to 10 (completely certain), how confident are you in your answer to the last question?
1 2 3 4 5 6 7 8 9 10 <NA>
995 527 1014 1732 4371 4260 6809 8042 4590 7514 29554
Class: numeric
# Attach descriptive labels to the financial outlook item (1-5 scale);
# out-of-range and missing values become NA, as before.
df_merged <- df_merged |>
  dplyr::mutate(
    fin_outlook_cat = base::factor(
      fin_outlook,
      levels = 1:5,
      labels = c(
        "Things will be much worse",
        "Things will be somewhat worse",
        "Things will be about the same",
        "Things will be somewhat better",
        "Things will be much better"
      ),
      ordered = TRUE
    )
  ) |>
dplyr::relocate(fin_outlook_cat, .after = fin_outlook)Attention and Care
$attention_care
There are people that care about and pay attention to what goes on in my life.
1 2 3 4 5 6 7 <NA>
836 838 1949 4431 13507 9044 9214 29589
Class: numeric
# Attach descriptive labels to the 7-point agreement item;
# out-of-range and missing values become NA, as before.
df_merged <- df_merged |>
  dplyr::mutate(
    attention_care_cat = base::factor(
      attention_care,
      levels = 1:7,
      labels = c(
        "Completely disagree",
        "Strongly disagree",
        "Disagree",
        "Neutral",
        "Agree",
        "Strongly agree",
        "Completely agree"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(attention_care_cat, .after = attention_care)
# Sanity check: View the distribution of the new variable
df_merged |>
dplyr::count(attention_care, attention_care_cat)# A tibble: 8 × 3
attention_care attention_care_cat n
<dbl> <ord> <int>
1 1 Completely disagree 836
2 2 Strongly disagree 838
3 3 Disagree 1949
4 4 Neutral 4431
5 5 Agree 13507
6 6 Strongly agree 9044
7 7 Completely agree 9214
8 NA <NA> 29589
Workplace Arrangement
$work_arrangement
Which most accurately describes your current work (or study) arrangement?
1 2 3 4 5 <NA>
17717 6169 3558 3226 2890 35848
Class: numeric
# Create categorical variable with labels.
# Two labelled (ordered) versions of the work-arrangement item:
#   work_arrangement_cat            - all respondents
#   work_arrangement_cat_nostudents - same, but non-working students set to NA
# Fix: this chunk used magrittr's %>% while the rest of the script uses the
# native |> pipe; switched to |> for consistency (behavior unchanged).
df_merged <- df_merged |>
  dplyr::mutate(
    work_arrangement_cat = base::factor(
      dplyr::case_when(
        work_arrangement == 1
        ~ "I work entirely in-person (i.e., in an office, on-site)",
        work_arrangement == 2
        ~ "I mostly work in-person, with occasional remote days",
        work_arrangement == 3
        ~ "I work about evenly in-person/remote",
        work_arrangement == 4
        ~ "I mostly work remotely, with occasional in-person days",
        work_arrangement == 5
        ~ "I work entirely remotely",
        TRUE ~ NA_character_
      ),
      levels = c(
        "I work entirely in-person (i.e., in an office, on-site)",
        "I mostly work in-person, with occasional remote days",
        "I work about evenly in-person/remote",
        "I mostly work remotely, with occasional in-person days",
        "I work entirely remotely"
      ),
      ordered = TRUE
    ),
    # Blank out non-working students; dplyr::if_else() propagates NA, so
    # respondents with missing employment_primary also end up NA here.
    work_arrangement_cat_nostudents = base::factor(
      dplyr::if_else(
        employment_primary == "Student non-working (Full or part-time)",
        NA_character_,
        as.character(work_arrangement_cat)
      ),
      levels = levels(work_arrangement_cat),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(work_arrangement_cat,
                  work_arrangement_cat_nostudents,
                  .after = work_arrangement)
# Sanity check: View the distribution of the new variable
df_merged |>
dplyr::group_by(work_arrangement, work_arrangement_cat) |>
dplyr::summarise(n = dplyr::n())# A tibble: 6 × 3
# Groups: work_arrangement [6]
work_arrangement work_arrangement_cat n
<dbl> <ord> <int>
1 1 I work entirely in-person (i.e., in an office, on-site) 17717
2 2 I mostly work in-person, with occasional remote days 6169
3 3 I work about evenly in-person/remote 3558
4 4 I mostly work remotely, with occasional in-person days 3226
5 5 I work entirely remotely 2890
6 NA <NA> 35848
# Sanity check: View the distribution of the new variable excluding students
base::table(df_merged$employment_primary,
df_merged$work_arrangement_cat_nostudents, useNA = "always")
I work entirely in-person (i.e., in an office, on-site)
Not in paid employment (by choice/health) 0
Not in paid employment (looking for work) 295
Student non-working (Full or part-time) 0
Employed/working full-time (25+ hours per week) 13053
Employed/working part-time (less than 25 hours per week) 1898
Retired 0
Military service 234
<NA> 0
I mostly work in-person, with occasional remote days I work about evenly in-person/remote
Not in paid employment (by choice/health) 0 0
Not in paid employment (looking for work) 118 79
Student non-working (Full or part-time) 0 0
Employed/working full-time (25+ hours per week) 4363 2251
Employed/working part-time (less than 25 hours per week) 773 581
Retired 0 0
Military service 51 48
<NA> 0 0
I mostly work remotely, with occasional in-person days I work entirely remotely <NA>
Not in paid employment (by choice/health) 0 0 3567
Not in paid employment (looking for work) 71 97 3511
Student non-working (Full or part-time) 0 0 8478
Employed/working full-time (25+ hours per week) 2051 1571 10252
Employed/working part-time (less than 25 hours per week) 586 632 2061
Retired 0 0 2592
Military service 37 28 559
<NA> 0 0 9571
Identification of Sponsored Participants
$br
id
5 <NA>
6445 62963
Class: numeric
$bs
pay
1 <NA>
1 69407
Class: numeric
0 1 <NA>
68208 1200 0
Class: numeric
# Create a new variable to identify sponsored participants
# sponsored = 1 when any sponsorship marker is present: a non-missing br or
# bs column, or irl == 1 (Irish sponsored sample).
# NOTE(review): br and bs appear to be platform-specific sponsor identifiers
# (see the id/pay tables printed above) — confirm their exact meaning.
# dplyr::if_else() would yield NA if irl were NA; the sanity check below
# shows zero NA in the result.
df_merged <- df_merged |>
dplyr::mutate(
sponsored = dplyr::if_else(
!is.na(br) | !is.na(bs) | irl == 1, 1, 0
)
)
# Sanity check
base::table(df_merged$sponsored, useNA = "always")
0 1 <NA>
61762 7646 0
A0.2. Applying exclusion criteria
Direct exclusion criteria
# Identify exclusion criteria and assign status
# Step 1: compute one logical flag per exclusion rule.
# Step 2: resolve the flags into a single valid_status factor with a fixed
# precedence — case_when() returns the FIRST matching rule, so the order of
# rules below defines which exclusion reason a multiply-flagged row gets.
df_merged <- df_merged |>
# Create explicit flags for each rule
dplyr::mutate(
# Incomplete = never reached the debts item; sponsored Irish respondents
# (irl == 1) are exempt because they were not asked about debts.
incomplete = is.na(debts_orig) & irl == 0,
# E1. Not resident based on manual checking of location validation
# important to note that the USA version was the default
# when the survey link was broken or shared without specifying a country
# in the URL metadata parameters.
not_resident = loc_resident == 0,
# E2. Implausible combination of working (3, 4, or 5 on employment)
# and reporting zero income.
# \b word boundaries keep "3"/"4"/"5" from matching inside other tokens
# of the comma-separated code string.
working_zero_income =
(stringr::str_detect(employment_orig, "\\b(3|4|5)\\b")) &
(income_orig == 0 | income_text_clean == 0),
# E3. Implausible combination of being retired (6 on employment)
# and having an age <= 25
retired_young =
(stringr::str_detect(employment_orig, "\\b6\\b")) &
(!is.na(age) & age <= 25),
# E4. Implausible combination of reporting
# very high MPWB (well-being) and very high PHQ4 (distress)
extremes_mpwb_phq4 =
!is.na(gad_worry) &
(mpwb_sum >= 65 & phq4_sum >= 24),
# E5. Respondents reporting high MPWB (well-being) and high PHQ-4 (distress),
# combined with unusually short adjusted completion time.
high_mpwb_phq4_speed =
!is.na(gad_worry) &
!is.na(duration_adj) &
(mpwb_sum >= 64 & phq4_sum >= 23 & duration_adj < 10),
# E6. Too-fast based on raw duration,
# except sponsored participants from Ireland (who don't have duration data)
# NOTE(review): duration_sec appears to be in seconds and duration_adj in
# a different (adjusted) unit, presumably minutes — confirm upstream.
too_fast_raw = duration_sec < 150 & irl == 0,
# E7. We observed a China-specific pattern of
# unusually fast completion times and low response variance.
china_too_fast_low_var =
country == "China" &
duration_adj < 10 &
mpwb_var < 1
) |>
# Assign status based on the ordered exclusion criteria
# (first match is assigned and the rest ignored)
dplyr::mutate(
valid_status = base::factor(dplyr::case_when(
incomplete ~ "incomplete",
not_resident ~ "not residents",
working_zero_income ~ "implausible working with no income",
retired_young ~ "implausible retired young",
extremes_mpwb_phq4 ~ "implausible extremes",
high_mpwb_phq4_speed ~ "implausible high scores with speed",
too_fast_raw ~ "too fast general",
china_too_fast_low_var ~ "too fast low var",
TRUE ~ "passed"
),
levels = c(
"incomplete",
"not residents",
"implausible working with no income",
"implausible retired young",
"implausible extremes",
"implausible high scores with speed",
"too fast general",
"too fast low var",
"passed")
)
)
# Sanity checks: Overall counts per status
base::table(df_merged$valid_status, useNA = "always")
incomplete not residents implausible working with no income implausible retired young
12181 705 271 39
implausible extremes implausible high scores with speed too fast general too fast low var
48 7 1595 737
passed <NA>
53825 0
# Sanity checks: Check counts for incomplete
df_merged |>
dplyr::filter(incomplete) |>
dplyr::group_by(valid_status, Finished, debts_orig, phq_interest) |>
dplyr::summarise(max_progress = max(Progress), n_incomplete = dplyr::n())# A tibble: 1 × 6
# Groups: valid_status, Finished, debts_orig [1]
valid_status Finished debts_orig phq_interest max_progress n_incomplete
<fct> <dbl> <chr> <dbl> <dbl> <int>
1 incomplete 0 <NA> NA 77 12181
# Sanity checks: Check counts for not residents
df_merged |>
dplyr::filter(not_resident & !incomplete) |>
dplyr::group_by(valid_status, country, loc_country) |>
dplyr::summarise(n_not_resident = dplyr::n()) |>
dplyr::arrange(country) |>
print_reactable(sorted_col = "country", width = 500)# Sanity checks: Check counts for participants working with zero income
df_merged |>
dplyr::filter(working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, employment_cat, income_orig, income_text_orig) |>
dplyr::summarise(n_working_zero_income = dplyr::n()) |>
dplyr::arrange(-n_working_zero_income)# A tibble: 15 × 5
# Groups: valid_status, employment_cat, income_orig [15]
valid_status employment_cat income_orig income_text_orig n_working_zero_income
<fct> <chr> <dbl> <chr> <int>
1 implausible working with no income Employed/working full-time (25+ hours per week) 0 <NA> 119
2 implausible working with no income Employed/working part-time (less than 25 hours per week) 0 <NA> 45
3 implausible working with no income Employed/working full-time (25+ hours per week) 10 0 20
4 implausible working with no income Full-time student; Employed/working part-time (less than 25 … 0 <NA> 20
5 implausible working with no income Full-time student; Employed/working full-time (25+ hours per… 0 <NA> 14
6 implausible working with no income Part-time student; Employed/working part-time (less than 25 … 0 <NA> 13
7 implausible working with no income Part-time student; Employed/working full-time (25+ hours per… 0 <NA> 12
8 implausible working with no income Military service 0 <NA> 10
9 implausible working with no income Full-time student; Military service 0 <NA> 6
10 implausible working with no income Employed/working part-time (less than 25 hours per week) 10 0 3
11 implausible working with no income Part-time student; Military service 0 <NA> 3
12 implausible working with no income Employed/working full-time (25+ hours per week); Military se… 0 <NA> 2
13 implausible working with no income Military service 10 0 2
14 implausible working with no income Full-time student; Employed/working full-time (25+ hours per… 10 0 1
15 implausible working with no income Part-time student; Employed/working part-time (less than 25 … 10 0 1
# Sanity checks: Check counts for retired young participants
df_merged |>
dplyr::filter(retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, employment_orig, age_group) |>
dplyr::summarise(n_retired_young = dplyr::n()) |>
dplyr::arrange(-n_retired_young)# A tibble: 4 × 4
# Groups: valid_status, employment_orig [4]
valid_status employment_orig age_group n_retired_young
<fct> <chr> <fct> <int>
1 implausible retired young 6 18-25 27
2 implausible retired young 1,6 18-25 6
3 implausible retired young 2,6 18-25 2
4 implausible retired young 3,6 18-25 2
# Sanity checks: Check counts for extremes in MPWB and PHQ4
df_merged |>
dplyr::filter(extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, mpwb_sum, phq4_sum) |>
dplyr::summarise(n_extremes_mpwb_phq4 = dplyr::n()) |>
dplyr::arrange(-n_extremes_mpwb_phq4)# A tibble: 6 × 4
# Groups: valid_status, mpwb_sum [3]
valid_status mpwb_sum phq4_sum n_extremes_mpwb_phq4
<fct> <dbl> <dbl> <int>
1 implausible extremes 70 28 12
2 implausible extremes 69 28 2
3 implausible extremes 67 25 1
4 implausible extremes 69 24 1
5 implausible extremes 70 25 1
6 implausible extremes 70 26 1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
dplyr::filter(high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, mpwb_sum, phq4_sum, duration_adj) |>
dplyr::summarise(n_high_mpwb_phq4_speed = dplyr::n()) |>
dplyr::arrange(-n_high_mpwb_phq4_speed)# A tibble: 3 × 5
# Groups: valid_status, mpwb_sum, phq4_sum [3]
valid_status mpwb_sum phq4_sum duration_adj n_high_mpwb_phq4_speed
<fct> <dbl> <dbl> <dbl> <int>
1 implausible high scores with speed 64 23 7.5 1
2 implausible high scores with speed 64 28 5.32 1
3 implausible high scores with speed 66 23 6.04 1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
dplyr::filter(too_fast_raw & !high_mpwb_phq4_speed &
!extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status) |>
dplyr::summarise(
min(duration_sec), max(duration_sec), n_too_fast_raw = dplyr::n())# A tibble: 1 × 4
valid_status `min(duration_sec)` `max(duration_sec)` n_too_fast_raw
<fct> <dbl> <dbl> <int>
1 too fast general 48 149 718
# Sanity checks: Check counts for China-specific exclusion
df_merged |>
dplyr::filter(china_too_fast_low_var & !too_fast_raw &
!high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, country) |>
dplyr::summarise(
min(mpwb_var), max(mpwb_var),
min(duration_adj), max(duration_adj), n_china_too_fast_low_var = dplyr::n())# A tibble: 1 × 7
# Groups: valid_status [1]
valid_status country `min(mpwb_var)` `max(mpwb_var)` `min(duration_adj)` `max(duration_adj)` n_china_too_fast_low_var
<fct> <chr> <dbl> <dbl> <dbl> <dbl> <int>
1 too fast low var China 0 0.989 5 9.95 439
Assessments to the Financial variables
Collaborators reviewed the financial variables and created flags indicating whether the responses were valid or not. Basic demographic information about the participants was given only upon request to assist with the review. The income, assets, and debts values that fell within the first income bracket and the last bracket were considered valid by default.
In Zimbabwe, all values were sent for review because there was a concern that participants reported values in Zimbabwean dollar instead of USD as collaborators used in the translation. Also, 14 participants from USA with a value equal to the first income bracket should have been accepted automatically but were sent for review by mistake.
Collaborators were also asked to provide a minimum cut-off for each variable. When the minimum cut-off was higher than the first income bracket, their sheet was updated with the values between the first income bracket and the minimum cut-off. Values of 0 in either financial variable were automatically accepted as is and were not given to collaborators for revision. The values that contained NA, “,” or “.” were also requested for review in order to validate our cleaning script.
The countries where sociodemographic information was provided were: Albania, Bangladesh, Finland, Georgia, Japan, Latvia, Lebanon, Oman, Peru, Portugal, Qatar, Russia, Singapore, Switzerland, Timor-Leste, Ukraine, USA, and Zimbabwe.
This assessment was not conducted for the sponsored participants from Ireland, as they did not provide open field answers regarding income, and were not asked to report assets and debts.
# A manual revision of the values was conducted before the sheet was given to
# collaborators.
# Automatic validity assessment of the three open-field financial variables.
# Coding: 1 = accepted automatically, 0 = not accepted (sent to collaborators
# for review), NA = not assessable (e.g. missing answer, or sponsored Irish
# participants for assets/debts). case_when is first-match-wins, so the order
# of the branches below is significant.
df_clean <- df_clean |>
  dplyr::mutate(
    fin_valid_aut_income =
      dplyr::case_when(
        # For participants that selected a decile
        # instead of providing an open text answer, consider them accepted
        income_orig < 10 ~ 1,
        # Values of 0 are automatically accepted as is.
        income_text_clean == 0 ~ 1,
        # If value contains "," or "." or other non-digit, consider them
        # not accepted, so collaborators can review them.
        !(stringr::str_detect(income_text_orig, "^[0-9]+$")) ~ 0,
        # If we detected a weird number, consider them not accepted.
        income_wrd ~ 0,
        # If value is above 0 but below the income first bracket,
        # consider them not accepted.
        irl == 0 &
          !is.na(income_text_orig) &
          income_text_clean != 0 &
          income_text_clean > 0 & income_text_clean < income_highpoint_1 ~ 0,
        # For all other participants, execute automatic assessment:
        # values between the upper bound of the first income bracket and the
        # lower bound of the last bracket are considered ACCEPTED.
        # (The previous comment here said "not accepted", contradicting the
        # `~ 1` on this branch and the documented default-valid rule.)
        irl == 0 &
          !is.na(income_text_orig) &
          income_text_clean != 0 &
          income_text_clean >= income_highpoint_1 &
          income_text_clean <= income_lowpoint_9 ~ 1,
        # For values above the last income bracket, consider them not accepted.
        irl == 0 &
          !is.na(income_text_orig) &
          income_text_clean != 0 &
          income_text_clean > income_lowpoint_9 ~ 0,
        TRUE ~ NA_real_
      ),
    # Same logic for assets.
    fin_valid_aut_assets =
      dplyr::case_when(
        # Sponsored participants from Ireland are assigned NA
        # because no open text answers were collected from them.
        irl == 1 ~ NA_real_,
        assets_clean == 0 ~ 1,
        !(stringr::str_detect(assets_orig, "^[0-9]+$")) ~ 0,
        assets_wrd ~ 0,
        irl == 0 &
          (!is.na(assets_orig) &
            assets_clean > 0 & assets_clean < income_highpoint_1) ~ 0,
        irl == 0 &
          (!is.na(assets_orig) &
            assets_clean != 0 &
            assets_clean >= income_highpoint_1 &
            assets_clean <= income_lowpoint_9) ~ 1,
        irl == 0 &
          (!is.na(assets_orig) &
            assets_clean != 0 &
            assets_clean > income_lowpoint_9) ~ 0,
        TRUE ~ NA_real_
      ),
    # Same logic for debts.
    fin_valid_aut_debts =
      dplyr::case_when(
        irl == 1 ~ NA_real_,
        debts_clean == 0 ~ 1,
        !(stringr::str_detect(debts_orig, "^[0-9]+$")) ~ 0,
        debts_wrd ~ 0,
        irl == 0 &
          (!is.na(debts_orig) &
            debts_clean > 0 & debts_clean < income_highpoint_1) ~ 0,
        irl == 0 &
          (!is.na(debts_orig) &
            debts_clean != 0 &
            debts_clean >= income_highpoint_1 &
            debts_clean <= income_lowpoint_9) ~ 1,
        irl == 0 &
          (!is.na(debts_orig) &
            debts_clean != 0 &
            debts_clean > income_lowpoint_9) ~ 0,
        TRUE ~ NA_real_
      )
  )
# Check whether the collaborator-provided minimum cut-offs exceed the upper
# bound of the first income bracket, and if so, demote values caught between
# that bound and the raised cut-off from "accepted" back to "not accepted".
df_clean <- df_clean |>
  dplyr::mutate(
    # TRUE when the cut-off is above the first bracket's upper bound
    # (NA when no cut-off was provided for that country/variable).
    income_above_cutoff = income_cutoff_min > income_highpoint_1,
    assets_above_cutoff = assets_cutoff_min > income_highpoint_1,
    debts_above_cutoff = debts_cutoff_min > income_highpoint_1,
    # Values in [income_highpoint_1, cut-off) are set to 0; everything else
    # keeps the automatic assessment. When the condition is NA (no cut-off
    # or missing value), the automatic assessment is kept too.
    fin_valid_aut_income_update = dplyr::if_else(
      income_above_cutoff &
        income_text_clean >= income_highpoint_1 &
        income_text_clean < income_cutoff_min,
      0,
      fin_valid_aut_income,
      missing = fin_valid_aut_income
    ),
    fin_valid_aut_assets_update = dplyr::if_else(
      assets_above_cutoff &
        assets_clean >= income_highpoint_1 &
        assets_clean < assets_cutoff_min,
      0,
      fin_valid_aut_assets,
      missing = fin_valid_aut_assets
    ),
    fin_valid_aut_debts_update = dplyr::if_else(
      debts_above_cutoff &
        debts_clean >= income_highpoint_1 &
        debts_clean < debts_cutoff_min,
      0,
      fin_valid_aut_debts,
      missing = fin_valid_aut_debts
    )
  )
# Sanity check: View the counts of automatic financial validity
base::table(df_clean$fin_valid_aut_income, useNA = "always")
0 1 <NA>
3185 50531 109
0 1 <NA>
27728 24897 1200
0 1 <NA>
13503 39122 1200
After we transmitted the values that were not automatically classified to collaborators in each country for review, we received back their assessments. We have extracted automatically the sheet with their assessments, and combined them into a single file.
# Collect the country documentation workbooks (xls/xlsx) from the
# "777_countries_documentation" folder, excluding the two files that contain
# no collaborator assessments: 777_Zambia and 777_Global.
files <- list.files(
  path = "777_countries_documentation",
  pattern = "\\.xls[x]?$",
  full.names = TRUE
)
files <- files[!grepl("^777_(Zambia|Global)", basename(files))]
process_sheet <- function(path, sheet_name, start_row, tab_label) {
  # Extract one assessment section from a country workbook.
  #
  # path       - path to the Excel workbook.
  # sheet_name - sheet to read (e.g. "HOUSEHOLD INCOME").
  # start_row  - row where the section (header row included) begins.
  # tab_label  - short label ("income"/"assets"/"debts") recorded in `tab`.
  #
  # Returns a tibble with nine standardized columns plus `file` and `tab`.
  sheet_all <- readxl::read_excel(path, sheet = sheet_name, col_names = FALSE)
  n_rows <- nrow(sheet_all)
  # Guard: `start_row:n_rows` counts DOWN when the sheet has fewer than
  # `start_row` rows, silently returning the rows in reverse order. Use an
  # empty selection instead so a short/empty sheet yields zero rows.
  keep_rows <- if (n_rows >= start_row) start_row:n_rows else integer(0)
  section <- sheet_all |>
    dplyr::slice(keep_rows) |>
    dplyr::select(1:9) |>
    # The first retained row is the section's header row: drop it.
    dplyr::slice(-1)
  names(section) <- c(
    "ResponseId",
    "UserLanguage",
    "orig",
    "clean",
    "classification",
    "value",
    "cutoff_max",
    "cutoff_min",
    "notes"
  )
  # Record the source workbook (without extension) and the section label.
  section |>
    dplyr::mutate(
      file = tools::file_path_sans_ext(basename(path)),
      tab = tab_label
    )
}
# Read the three assessment sections from every country workbook and combine
# them into one wide table: one row per participant, with the income /
# assets / debts assessments side by side.
assessment_fin <- purrr::map_dfr(files, function(path) {
  # Sheet layouts differ: the income section starts at row 22,
  # the assets and debts sections at row 9.
  d1 <- process_sheet(path, "HOUSEHOLD INCOME", 22, "income")
  d2 <- process_sheet(path, "ASSETS", 9, "assets")
  d3 <- process_sheet(path, "DEBTS", 9, "debts")
  dplyr::bind_rows(d1, d2, d3)
}) |>
  # "value" is the replacement value collaborators supplied (the sheet's
  # column F, referenced by the "Change to: [add value on column F]"
  # classification); rename it to the more descriptive "change".
  dplyr::rename(
    change = value
  ) |>
  dplyr::mutate(
    # Coerce to numeric; non-numeric entries become NA (with a warning).
    clean = base::as.numeric(clean),
    cutoff_max = base::as.numeric(cutoff_max),
    cutoff_min = base::as.numeric(cutoff_min)
  ) |>
  # One row per (ResponseId, UserLanguage); the three tabs become suffixed
  # column sets, e.g. classification_income, clean_debts, cutoff_min_assets.
  tidyr::pivot_wider(
    id_cols = c("ResponseId", "UserLanguage"),
    names_from = "tab",
    values_from = c(
      "change",
      "classification",
      "cutoff_min",
      "cutoff_max",
      "orig",
      "clean"
    ),
    names_sep = "_"
)Rows: 33,245
Columns: 20
$ ResponseId <chr> "R_2S9d1LQe5gzhMGp", "R_2duaXZQf76tNTnX", "R_8YEzJo4GF1VSJiU", "R_8D…
$ UserLanguage <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "S…
$ change_income <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_assets <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_debts <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ classification_income <chr> "Not possible/not believable", "Not possible/not believable", "Not p…
$ classification_assets <chr> "OK", "OK", NA, "OK", "OK", "OK", "OK", "OK", "OK", NA, NA, NA, NA, …
$ classification_debts <chr> "OK", NA, "Cannot determine", NA, "Cannot determine", NA, NA, NA, NA…
$ cutoff_min_income <dbl> 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000…
$ cutoff_min_assets <dbl> 1000, 1000, NA, 1000, 1000, 1000, 1000, 1000, 1000, NA, NA, NA, NA, …
$ cutoff_min_debts <dbl> 1000, NA, 1000, NA, 1000, NA, NA, NA, NA, NA, NA, NA, NA, 1000, NA, …
$ cutoff_max_income <dbl> 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 25000…
$ cutoff_max_assets <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cutoff_max_debts <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ orig_income <chr> "100", "410", "500", "600", "700", "800", "900", "1200", "2000", "30…
$ orig_assets <chr> "10000", "100000", NA, "10000", "5000", "5000", "3000", "500.000", "…
$ orig_debts <chr> "5000", NA, "500", NA, "250", NA, NA, NA, NA, NA, NA, NA, NA, "30000…
$ clean_income <dbl> 100, 410, 500, 600, 700, 800, 900, 1200, 2000, 3000, 3840, 5000, 700…
$ clean_assets <dbl> 10000, 100000, NA, 10000, 5000, 5000, 3000, 500000, 2000, NA, NA, NA…
$ clean_debts <dbl> 5000, NA, 500, NA, 250, NA, NA, NA, NA, NA, NA, NA, NA, 30000, NA, N…
# Sanity check: Compare the cut-off min values
fin_cut_income <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_income) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_income)) |> dplyr::rename(income_cutoff_min = cutoff_min_income)
df_cut_income <- df_clean |> dplyr::group_by(UserLanguage, income_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(income_cutoff_min) & UserLanguage %in% fin_cut_income$UserLanguage)
dplyr::setequal(fin_cut_income, df_cut_income)[1] TRUE
fin_cut_assets <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_assets) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_assets)) |> dplyr::rename(assets_cutoff_min = cutoff_min_assets)
df_cut_assets <- df_clean |> dplyr::group_by(UserLanguage, assets_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(assets_cutoff_min) & UserLanguage %in% fin_cut_assets$UserLanguage)
dplyr::setequal(fin_cut_assets, df_cut_assets)[1] TRUE
fin_cut_debts <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_debts) |> summarise() |> dplyr::filter(!is.na(cutoff_min_debts)) |> dplyr::rename(debts_cutoff_min = cutoff_min_debts)
df_cut_debts <- df_clean |> dplyr::group_by(UserLanguage, debts_cutoff_min) |> dplyr::summarise() |> filter(!is.na(debts_cutoff_min) & UserLanguage %in% fin_cut_debts$UserLanguage)
dplyr::setequal(fin_cut_debts, df_cut_debts)[1] TRUE
# Sanity check:
# Are there any UserLanguage in assessment_fin that are not in df_merged?
base::setdiff(
unique(assessment_fin$UserLanguage),
unique(df_merged$UserLanguage)
)character(0)
# Sanity check:
# All values between clean_income in assessment_fin
# and income_text_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_income) |> filter(!is.na(clean_income)) |>
dplyr::left_join(
df_clean |> select(ResponseId, income_text_clean),
by = "ResponseId"
) |>
dplyr::mutate(match = clean_income == income_text_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
match n
<lgl> <int>
1 TRUE 3183
# All values between clean_assets in assessment_fin
# and assets_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_assets) |> filter(!is.na(clean_assets)) |>
dplyr::left_join(
df_clean |> select(ResponseId, assets_clean),
by = "ResponseId"
) |>
dplyr::mutate(match = clean_assets == assets_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
match n
<lgl> <int>
1 TRUE 27351
# All values between clean_debts in assessment_fin
# and debts_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_debts) |> filter(!is.na(clean_debts)) |>
dplyr::left_join(
df_clean |> select(ResponseId, debts_clean),
by = "ResponseId"
) |>
dplyr::mutate(match = clean_debts == debts_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
match n
<lgl> <int>
1 TRUE 12860
# Sanity check: Any duplicated ResponseId in assessment_fin?
assessment_fin |>
dplyr::count(ResponseId) |>
dplyr::filter(n > 1) |>
base::nrow()[1] 0
[1] 53825
# Attach the collaborator assessments to the main data and apply any
# replacement values they recommended.
df_clean <- df_clean |>
  dplyr::left_join(
    assessment_fin |>
      dplyr::select(
        ResponseId,
        classification_income, change_income,
        classification_assets, change_assets,
        classification_debts, change_debts
      ),
    by = "ResponseId"
  ) |>
  dplyr::mutate(
    # When the classification asks for a replacement ("Change to: ..."),
    # use the collaborator's value; otherwise keep the cleaned value.
    income_text_reviewed = dplyr::if_else(
      !is.na(classification_income) &
        stringr::str_detect(classification_income,
                            "Change to: \\[add value on column F\\]"),
      as.numeric(change_income),
      income_text_clean
    ),
    assets_reviewed = dplyr::if_else(
      !is.na(classification_assets) &
        stringr::str_detect(classification_assets,
                            "Change to: \\[add value on column F\\]"),
      as.numeric(change_assets),
      assets_clean
    ),
    debts_reviewed = dplyr::if_else(
      !is.na(classification_debts) &
        stringr::str_detect(classification_debts,
                            "Change to: \\[add value on column F\\]"),
      as.numeric(change_debts),
      debts_clean
    )
  )
nrow(df_clean)[1] 53825
# Sanity checks: View the counts of cells that were automatically approved
# and were still reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean,
classification_income, fin_valid_aut_income,
fin_valid_aut_income_update) |>
dplyr::filter(fin_valid_aut_income_update == 1 & !is.na(classification_income)
& income_text_clean > 0) |>
dplyr::group_by(country, classification_income) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean,
classification_assets, fin_valid_aut_assets,
fin_valid_aut_assets_update) |>
dplyr::filter(fin_valid_aut_assets_update==1 & !is.na(classification_assets)
& assets_clean > 0) |>
dplyr::group_by(country, classification_assets) |>
dplyr::summarise(n = dplyr::n())# A tibble: 3 × 3
# Groups: country [2]
country classification_assets n
<chr> <chr> <int>
1 USA Cannot determine 2
2 USA OK 94
3 Zimbabwe OK 82
df_clean |> dplyr::select(ResponseId, country, debts_clean,
classification_debts, fin_valid_aut_debts,
fin_valid_aut_debts_update) |>
dplyr::filter(fin_valid_aut_debts_update== 1 & !is.na(classification_debts) &
debts_clean > 0) |>
dplyr::group_by(country, classification_debts) |>
dplyr::summarise(n = dplyr::n())# A tibble: 3 × 3
# Groups: country [2]
country classification_debts n
<chr> <chr> <int>
1 USA OK 14
2 Zimbabwe Change to: [add value on column F] 20
3 Zimbabwe OK 58
# Sanity checks: View the counts of cells that were automatically disapproved
# and were not reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean,
classification_income, fin_valid_aut_income,
fin_valid_aut_income_update) |>
dplyr::filter(fin_valid_aut_income_update == 0 & is.na(classification_income)
& income_text_clean > 0) |>
dplyr::group_by(country, classification_income) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean,
classification_assets, fin_valid_aut_assets,
fin_valid_aut_assets_update) |>
dplyr::filter(fin_valid_aut_assets_update == 0 & is.na(classification_assets)
& assets_clean > 0) |>
dplyr::group_by(country, classification_assets) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
df_clean |> dplyr::select(ResponseId, country, debts_clean, classification_debts,
fin_valid_aut_debts, fin_valid_aut_debts_update) |>
dplyr::filter(fin_valid_aut_debts_update == 0 & is.na(classification_debts)
& debts_clean > 0) |>
dplyr::group_by(country, classification_debts) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
# Create variable where we fit the open field answers into the brackets.
find_decile <- function(lang, income_val) {
  # Map an open-text income value onto the language-specific bracket scheme
  # defined in the global `income_gaps` table.
  #
  # Returns NA_real_ when the value is missing or no brackets exist for
  # `lang`; the bracket's original code (income_orig) when the value falls
  # inside a bracket; 9 otherwise.
  if (is.na(income_val)) {
    return(NA_real_)
  }
  lang_brackets <- income_gaps[income_gaps$UserLanguage == lang, , drop = FALSE]
  if (nrow(lang_brackets) == 0) {
    return(NA_real_)
  }
  for (idx in seq_len(nrow(lang_brackets))) {
    lo <- lang_brackets$income_lowpoint_adj[idx]
    hi <- lang_brackets$income_highpoint_adj[idx]
    if (is.na(lo)) {
      # A bracket without a lower bound cannot be matched; skip it.
      next
    }
    # A missing upper bound marks the open-ended top bracket: [lo, Inf).
    in_bracket <- if (is.na(hi)) {
      income_val >= lo
    } else {
      income_val >= lo && income_val <= hi
    }
    if (in_bracket) {
      return(as.numeric(lang_brackets$income_orig[idx]))
    }
  }
  # No bracket matched. This is intended for values above every bracket,
  # but note it also fires for values falling in a gap between brackets
  # (if the table has gaps) -- both are coded as the top bracket, 9.
  9
}
# Merge the bracket-choice income (income_orig 0-9) with the open-field
# answers (income_orig == 10), fitting the latter into the language-specific
# brackets via find_decile(). case_when is first-match-wins, so order matters.
df_clean <- df_clean |>
  dplyr::mutate(
    income_merg = dplyr::case_when(
      # No income answer at all.
      is.na(income_orig) ~ NA_real_,
      # Participant picked a bracket directly: keep their choice.
      !is.na(income_orig) & income_orig != 10 ~ income_orig,
      # Open-field option chosen but no usable (reviewed) value remains.
      income_orig == 10 & is.na(income_text_reviewed) ~ NA_real_,
      # An open-field value of 0 maps to the "no income" code.
      income_orig == 10 &
        !is.na(income_text_reviewed) &
        income_text_reviewed == 0 ~ 0,
      # Remaining open-field values: fit into the bracket scheme row-wise.
      TRUE ~ purrr::map2_dbl(
        UserLanguage,
        income_text_reviewed,
        find_decile
      )
    ),
    # Ordered factor with human-readable labels for the merged codes.
    # NOTE(review): code 1 is labelled "Second decile" and there is no
    # "First decile" label (0 is "No income") -- confirm this mapping
    # against 222_codebook.xlsx.
    income_merg_cat = base::factor(
      dplyr::case_when(
        income_merg == 0 ~ "No income",
        income_merg == 1 ~ "Second decile",
        income_merg == 2 ~ "Third decile",
        income_merg == 3 ~ "Fourth decile",
        income_merg == 4 ~ "Fifth decile",
        income_merg == 5 ~ "Sixth decile",
        income_merg == 6 ~ "Seventh decile",
        income_merg == 7 ~ "Eighth decile",
        income_merg == 8 ~ "Ninth decile",
        income_merg == 9 ~ "Tenth decile",
        TRUE ~ NA_character_
      ),
      levels = c(
        "No income",
        "Second decile",
        "Third decile",
        "Fourth decile",
        "Fifth decile",
        "Sixth decile",
        "Seventh decile",
        "Eighth decile",
        "Ninth decile",
        "Tenth decile"
      ),
      ordered = TRUE
    ),
    # Coarser ordered grouping of the decile categories.
    income_merg_group = base::factor(
      dplyr::case_when(
        income_merg_cat == "No income"
        ~ "No income",
        income_merg_cat %in% c("Second decile", "Third decile", "Fourth decile")
        ~ "Low",
        income_merg_cat %in% c("Fifth decile", "Sixth decile")
        ~ "Mid",
        income_merg_cat %in% c("Seventh decile", "Eighth decile", "Ninth decile")
        ~ "Upper",
        income_merg_cat == "Tenth decile"
        ~ "Wealthiest",
        TRUE ~ NA_character_
      ),
      levels = c("No income", "Low", "Mid", "Upper", "Wealthiest"),
      ordered = TRUE
    )
  )
# Attach the adjusted bracket bounds for each participant's merged income
# bracket, then build a human-readable range label for it.
df_clean <- df_clean |>
  dplyr::left_join(
    income_gaps |>
      dplyr::select(
        UserLanguage,
        income_orig,
        income_lowpoint_adj,
        income_highpoint_adj
      ),
    # income_merg holds the bracket code, so it joins on income_orig.
    by = c("UserLanguage", "income_merg" = "income_orig")
  ) |>
  dplyr::mutate(
    income_merg_translated = dplyr::case_when(
      is.na(income_merg) ~ NA_character_,
      # "No income" bracket is rendered as plain "0".
      income_merg == 0 ~ "0",
      # Closed interval [low, high]
      !is.na(income_lowpoint_adj) &
        !is.na(income_highpoint_adj)
      ~ paste0(
        income_lowpoint_adj,
        " - ",
        income_highpoint_adj
      ),
      # Open upper bound [low, ∞)
      !is.na(income_lowpoint_adj) &
        is.na(income_highpoint_adj)
      ~ paste0(
        income_lowpoint_adj,
        "+"
      ),
      TRUE ~ NA_character_
    )
  )
# Sanity checks: View counts of merged income variable
df_clean |>
dplyr::filter(
income_orig == 10,
!is.na(income_text_reviewed)
) |>
dplyr::group_by(
UserLanguage,
income_merg,
income_merg_translated
) |>
dplyr::summarise(
min_income_text_reviewed = min(income_text_reviewed, na.rm = TRUE),
max_income_text_reviewed = max(income_text_reviewed, na.rm = TRUE)
) |>
print_reactable(sorted_col = "UserLanguage", width = 800)df_clean |>
dplyr::group_by(income_orig, income_merg) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 21 × 3
# Groups: income_orig [12]
income_orig income_merg n
<dbl> <dbl> <int>
1 0 0 1866
2 1 1 4775
3 2 2 5754
4 3 3 5961
5 4 4 5778
6 5 5 5195
7 6 6 4646
8 7 7 4583
9 8 8 3585
10 9 9 5347
11 10 0 91
12 10 1 1951
13 10 2 834
14 10 3 717
15 10 4 583
16 10 5 443
17 10 6 356
18 10 7 297
19 10 8 208
20 10 9 746
21 NA NA 109
Red flag exclusion
Each flag corresponds to a specific pattern that may indicate low-quality data. This process was only applied to participants who passed the direct exclusion criteria.
# Identified participants with IP addresses known to be associated with botnets.
botnet_ids <-
readr::read_csv("111_response_ids_botnets.csv", show_col_types = FALSE) |>
dplyr::pull(ResponseId) |>
base::trimws(); length(botnet_ids)[1] 262
# Identified participants with IP addresses massively repeated
# across multiple responses.
massive_rep_ids <-
readr::read_csv("111_ip_repeated.csv", show_col_types = FALSE) |>
dplyr::pull(ResponseId) |>
base::trimws(); length(massive_rep_ids)[1] 10300
# Start the flagging process.
# Each flag is 1 (raised), 0 (not raised), or NA (inputs missing, so not
# assessable). Exceptions: F6 contributes 0.5 instead of 1, and F9 is graded
# 0/1/2. flag_total sums every flag_* column (NAs counted as 0) and a
# participant is excluded when the weighted total exceeds 4.
df_flagged <- df_clean |>
  dplyr::mutate(
    # F1. Household >=4 and zero income
    flag_hh4_zero_income =
      dplyr::if_else(
        irl == 0 &
          household_size >= 4 &
          income_merg == 0,
        1,
        0,
        missing = NA_real_
      ),
    # F2. Any financial items not valid
    # (any collaborator classification other than "OK" or an accepted
    # replacement value)
    flag_fin_invalid =
      dplyr::if_else(
        irl == 0 &
          (
            (!is.na(classification_assets) &
              !(classification_assets %in% c("OK", "Change to: [add value on column F]"))) |
            (!is.na(classification_debts) &
              !(classification_debts %in% c("OK", "Change to: [add value on column F]"))) |
            (!is.na(classification_income) &
              !(classification_income %in% c("OK", "Change to: [add value on column F]")))
          ),
        1,
        0,
        missing = NA_real_
      ),
    # F3. Low variance in MPWB, life satisfaction = 10,
    # and no income or very low education
    flag_ls10_noincome_var =
      dplyr::if_else(
        irl == 0 &
          mpwb_var < 1 &
          life_satisfaction == 10 &
          (education_recoded == 1 |
            income_merg == 0),
        1,
        0,
        missing = NA_real_
      ),
    # F4. Assets and debts are the same value (excluding both zero and NA)
    flag_assets_debts_same =
      dplyr::if_else(
        irl == 0 &
          !is.na(assets_clean) &
          !is.na(debts_clean) &
          assets_clean == debts_clean &
          !(assets_clean == 0 & debts_clean == 0),
        1,
        0,
        missing = NA_real_
      ),
    # F5. Full-time student and lowest education level
    # (Peru participants that selected inclusive education are exempt because
    # they have NA in education_recoded)
    flag_student_lowedu =
      dplyr::if_else(
        !is.na(education_recoded) &
          education_recoded == 1 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b1\\b"),
        1,
        0,
        missing = NA_real_
      ),
    # F6. Zero variance in MPWB items
    # (contributes only 0.5 to flag_total, not a full point)
    flag_mpwb_zerovar =
      dplyr::if_else(
        !is.na(mpwb_var) & mpwb_var == 0,
        0.5,
        0,
        missing = NA_real_
      ),
    # F7. Nonsensical sex or ethnicity
    flag_nonsensical_sex_ethn =
      dplyr::if_else(
        irl == 0 &
          (
            (!is.na(sex_text_recoded) & sex_text_recoded == "Cannot determine") |
            (!is.na(ethnicity_specify) & ethnicity_specify == "Cannot determine")
          ),
        1,
        0,
        missing = NA_real_
      ),
    # F8. High MPWB and high PHQ-4
    flag_high_mpwb_phq4 =
      dplyr::if_else(
        irl == 0 &
          !is.na(gad_worry) &
          mpwb_sum >= 60 &
          phq4_sum >= 20 &
          duration_adj < 10,
        1,
        0,
        missing = NA_real_
      ),
    # F9. LS vs mean MPWB mismatch
    # (graded: |difference| > 5 contributes 2, |difference| > 4 contributes 1)
    flag_ls_vs_mpwb =
      dplyr::case_when(
        base::abs(life_satisfaction - mpwb_mean) > 5 ~ 2,
        base::abs(life_satisfaction - mpwb_mean) > 4 ~ 1,
        TRUE ~ 0
      ),
    # F10. Age >75 and working or studying
    flag_age75_workstudy =
      dplyr::if_else(
        age >= 75 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b(1|2|3|4|5)\\b"),
        1,
        0,
        missing = NA_real_
      ),
    # F11. Advanced education and <22 years
    flag_young_advance =
      dplyr::if_else(
        !is.na(education_recoded) &
          education_recoded == 5 &
          age < 22,
        1,
        0,
        missing = NA_real_
      ),
    # F12. Short duration and low MPWB variance
    flag_duration_var =
      dplyr::if_else(
        irl == 0 &
          duration_adj < 10 &
          mpwb_var < 1,
        1,
        0,
        missing = NA_real_
      ),
    # F13. Independent, <20, and richest income
    flag_young_rich_alone =
      dplyr::if_else(
        household_size == 1 &
          age < 20 &
          income_merg >= 8,
        1,
        0,
        missing = NA_real_
      ),
    # F14. Retired and working at the same time
    flag_retired_working =
      dplyr::if_else(
        irl == 0 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b6\\b") &
          stringr::str_detect(employment_orig, "\\b(3|4|5)\\b"),
        1,
        0,
        missing = NA_real_
      ),
    # F15. Strange numbers in financial variables
    # If collaborators already marked the value as not OK,
    # then there is no need to repeat this flag.
    # (case_when here, so any NA condition falls through to 0.)
    flag_weird_nr =
      dplyr::case_when(
        irl == 0 &
          (
            (assets_wrd & classification_assets == "OK") |
            (debts_wrd & classification_debts == "OK") |
            (income_wrd & classification_income == "OK")
          )
        ~ 1,
        TRUE ~ 0
      ),
    # F16. Botnet ResponseIds
    flag_botnet =
      dplyr::if_else(
        ResponseId %in% botnet_ids,
        1,
        0,
        missing = NA_real_
      ),
    # F17. Massive repetition of IP + short duration + low variance
    flag_rep =
      dplyr::if_else(
        ResponseId %in% massive_rep_ids &
          duration_adj < 10 &
          mpwb_var < 1,
        1,
        0,
        missing = NA_real_
      ),
    # Total flags
    # (flag_total is created after all flag_* columns above, so it is not
    # itself included in the row sum; NAs are treated as 0.)
    flag_total =
      base::rowSums(
        dplyr::across(dplyr::starts_with("flag_")),
        na.rm = TRUE
      ),
    # Exclusion flag: raised when the weighted flag total exceeds 4.
    exclusion_flags =
      dplyr::if_else(
        flag_total > 4,
        1,
        0
      ),
    # Update valid_status: convert to character so case_when can mix the
    # existing labels with "flagged", then rebuild the factor.
    valid_status = base::as.character(valid_status),
    valid_status =
      base::factor(
        dplyr::case_when(
          exclusion_flags == 1 ~ "flagged",
          TRUE ~ valid_status
        ),
        levels = c(
          "flagged",
          "passed"
        )
      )
  )
# Sanity Check: View the counts of exclusion flags
table(df_flagged$valid_status, df_flagged$exclusion_flags, useNA = "always")
0 1 <NA>
flagged 0 26 0
passed 53799 0 0
<NA> 0 0 0
# Sanity Check: View the counts and percentages of each flag
df_flagged |>
dplyr::select(dplyr::starts_with("flag_")) |>
dplyr::summarise(
dplyr::across(
dplyr::everything(),
~ sum((!is.na(.) & . != 0))
)
) |>
tidyr::pivot_longer(
cols = dplyr::everything(),
names_to = "flag",
values_to = "n_flagged"
) |>
dplyr::mutate(
percent_flagged = 100 * n_flagged / nrow(df_flagged)
) |>
dplyr::arrange(dplyr::desc(percent_flagged))# A tibble: 18 × 3
flag n_flagged percent_flagged
<chr> <int> <dbl>
1 flag_total 16396 30.5
2 flag_duration_var 6935 12.9
3 flag_fin_invalid 4366 8.11
4 flag_ls_vs_mpwb 3146 5.84
5 flag_mpwb_zerovar 1804 3.35
6 flag_nonsensical_sex_ethn 1052 1.95
7 flag_assets_debts_same 703 1.31
8 flag_rep 621 1.15
9 flag_hh4_zero_income 493 0.916
10 flag_retired_working 337 0.626
11 flag_ls10_noincome_var 215 0.399
12 flag_botnet 202 0.375
13 flag_student_lowedu 162 0.301
14 flag_weird_nr 124 0.230
15 flag_age75_workstudy 70 0.130
16 flag_young_advance 50 0.0929
17 flag_high_mpwb_phq4 29 0.0539
18 flag_young_rich_alone 8 0.0149
# Sanity check: View combinations of classifications where flag_hh4_zero_income is raised
df_flagged |>
dplyr::filter(flag_hh4_zero_income == 1) |>
dplyr::group_by(household_size, income_merg, irl) |>
dplyr::summarise(n = dplyr::n())# A tibble: 15 × 4
# Groups: household_size, income_merg [15]
household_size income_merg irl n
<dbl> <dbl> <dbl> <int>
1 4 0 0 196
2 5 0 0 120
3 6 0 0 68
4 7 0 0 41
5 8 0 0 20
6 9 0 0 10
7 10 0 0 15
8 11 0 0 1
9 12 0 0 4
10 13 0 0 3
11 14 0 0 3
12 15 0 0 2
13 17 0 0 1
14 18 0 0 1
15 20 0 0 8
# Sanity check: View combinations of classifications where flag_fin_invalid is raised
df_flagged |>
dplyr::filter(flag_fin_invalid == 1) |>
dplyr::group_by(classification_income, classification_assets, classification_debts, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "classification_income", width = 800)# Sanity check: View combinations of classifications where flag_ls10_noincome_var is raised
df_flagged |>
dplyr::filter(flag_ls10_noincome_var == 1) |>
dplyr::group_by(mpwb_var, life_satisfaction, education_recoded, income_merg, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "mpwb_var", width = 800)# Sanity check: View combinations of classifications where flag_assets_debts_same is raised
df_flagged |>
dplyr::filter(flag_assets_debts_same == 1) |>
dplyr::group_by(assets_clean, debts_clean, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "assets_clean", width = 500)# Sanity check: View combinations of classifications where flag_student_lowedu is raised
df_flagged |>
dplyr::filter(flag_student_lowedu == 1) |>
dplyr::group_by(education_recoded, employment_orig) |>
dplyr::summarise(n = dplyr::n())# A tibble: 8 × 3
# Groups: education_recoded [1]
education_recoded employment_orig n
<dbl> <chr> <int>
1 1 1 130
2 1 1,3 7
3 1 1,4 7
4 1 1,5 1
5 1 1,6 1
6 1 1,7 3
7 1 1,8 6
8 1 1,9 7
# Sanity check: View combinations of classifications where flag_mpwb_zerovar is raised
df_flagged |>
dplyr::filter(flag_mpwb_zerovar == 0.5) |>
dplyr::group_by(mpwb_var) |>
dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
mpwb_var n
<dbl> <int>
1 0 1804
# Sanity check: View combinations of classifications where flag_nonsensical_sex_ethn is raised
df_flagged |>
dplyr::filter(flag_nonsensical_sex_ethn == 1) |>
dplyr::group_by(sex_text_recoded, ethnicity_specify, irl) |>
dplyr::summarise(n = dplyr::n())# A tibble: 10 × 4
# Groups: sex_text_recoded, ethnicity_specify [10]
sex_text_recoded ethnicity_specify irl n
<chr> <chr> <dbl> <int>
1 Cannot determine Cannot determine 0 14
2 Cannot determine Cypriot 0 1
3 Cannot determine Other 0 5
4 Cannot determine Roma 0 1
5 Cannot determine The 4 Rs: (Nkole, Kiga, Batooro and Banyoro) 0 1
6 Cannot determine White 0 1
7 Cannot determine <NA> 0 81
8 Male Cannot determine 0 2
9 Non-binary Cannot determine 0 6
10 <NA> Cannot determine 0 940
# Sanity check: View combinations of classifications where flag_high_mpwb_phq4 is raised
df_flagged |>
dplyr::filter(flag_high_mpwb_phq4 == 1) |>
dplyr::group_by(mpwb_sum, phq4_sum, gad_worry, duration_adj, irl) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 29 × 6
# Groups: mpwb_sum, phq4_sum, gad_worry, duration_adj [29]
mpwb_sum phq4_sum gad_worry duration_adj irl n
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 60 20 5 7.66 0 1
2 60 22 5 6.41 0 1
3 60 25 6 9.03 0 1
4 60 28 7 7.66 0 1
5 61 22 4 9.59 0 1
6 61 28 7 7.10 0 1
7 62 20 4 7.8 0 1
8 62 22 7 8.83 0 1
9 62 25 5 9.66 0 1
10 62 25 7 6.72 0 1
11 62 26 6 7.79 0 1
12 62 28 7 9.39 0 1
13 62 28 7 9.93 0 1
14 63 20 6 9.17 0 1
15 63 21 4 5.55 0 1
16 63 24 5 6.76 0 1
17 63 24 6 6.59 0 1
18 63 24 6 8.2 0 1
19 63 24 6 9.10 0 1
20 63 25 7 5.90 0 1
21 63 28 7 7.03 0 1
22 64 22 7 8.23 0 1
23 65 20 5 9 0 1
24 65 21 6 5.37 0 1
25 66 21 6 7.83 0 1
26 66 22 6 9.38 0 1
27 70 20 5 6.90 0 1
28 70 20 7 7.93 0 1
29 70 22 4 6.6 0 1
# Sanity check: View combinations of classifications where flag_ls_vs_mpwb is raised
df_flagged |>
dplyr::filter(flag_ls_vs_mpwb >= 1) |>
dplyr::mutate(diff_ls_mpwb = base::abs(life_satisfaction - mpwb_mean)) |>
dplyr::group_by(life_satisfaction, mpwb_mean, diff_ls_mpwb, flag_ls_vs_mpwb, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "mpwb_mean", width = 500)# Sanity check: View combinations of classifications where flag_age75_workstudy is raised
df_flagged |>
dplyr::filter(flag_age75_workstudy == 1) |>
dplyr::group_by(employment_cat, age) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "employment_cat", width = 800)# Sanity check: View combinations of classifications where flag_young_advance is raised
df_flagged |>
dplyr::filter(flag_young_advance == 1) |>
dplyr::group_by(education_recoded_cat, age) |>
dplyr::summarise(n = dplyr::n())# A tibble: 4 × 3
# Groups: education_recoded_cat [1]
education_recoded_cat age n
<ord> <dbl> <int>
1 Advanced 18 5
2 Advanced 19 13
3 Advanced 20 12
4 Advanced 21 20
# Sanity check: View combinations of classifications where flag_duration_var is raised
df_flagged |>
dplyr::filter(flag_duration_var == 1) |>
dplyr::mutate(min_duration_adj = min(duration_adj),
max_duration_adj = max(duration_adj),
min_mpwb_var = min(mpwb_var),
max_mpwb_var = max(mpwb_var)) |>
dplyr::group_by(min_duration_adj, max_duration_adj,
min_mpwb_var, max_mpwb_var, irl) |>
dplyr::summarise(n = dplyr::n())# A tibble: 1 × 6
# Groups: min_duration_adj, max_duration_adj, min_mpwb_var, max_mpwb_var [1]
min_duration_adj max_duration_adj min_mpwb_var max_mpwb_var irl n
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 5 9.97 0 0.989 0 6935
# Sanity check: View combinations of classifications where flag_young_rich_alone is raised
df_flagged |>
dplyr::filter(flag_young_rich_alone == 1) |>
dplyr::group_by(household_size, age, income_merg) |>
summarise(n = dplyr::n())# A tibble: 4 × 4
# Groups: household_size, age [2]
household_size age income_merg n
<dbl> <dbl> <dbl> <int>
1 1 18 8 2
2 1 18 9 1
3 1 19 8 3
4 1 19 9 2
# Sanity check: View combinations of classifications where flag_retired_working is raised
df_flagged |>
dplyr::filter(flag_retired_working == 1) |>
dplyr::group_by(employment_orig, age, irl) |>
summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "employment_orig", width = 500)# Sanity check: View combinations of classifications where flag_weird_nr is raised
df_flagged |>
dplyr::filter(flag_weird_nr == 1) |>
dplyr::group_by(assets_wrd, classification_assets,
debts_wrd, classification_debts,
income_wrd, classification_income, irl) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 25 × 8
# Groups: assets_wrd, classification_assets, debts_wrd, classification_debts, income_wrd, classification_income [25]
assets_wrd classification_assets debts_wrd classification_debts income_wrd classification_income irl n
<lgl> <chr> <lgl> <chr> <lgl> <chr> <dbl> <int>
1 FALSE OK FALSE <NA> TRUE OK 0 16
2 FALSE OK TRUE OK FALSE OK 0 2
3 FALSE OK TRUE OK FALSE Very unlikely to be true 0 2
4 FALSE OK TRUE OK FALSE <NA> 0 19
5 FALSE OK TRUE <NA> TRUE OK 0 1
6 FALSE Very unlikely to be true FALSE <NA> TRUE OK 0 1
7 FALSE Very unlikely to be true TRUE OK FALSE <NA> 0 2
8 FALSE <NA> FALSE OK TRUE OK 0 3
9 FALSE <NA> FALSE <NA> TRUE OK 0 6
10 FALSE <NA> TRUE OK FALSE OK 0 1
11 FALSE <NA> TRUE OK FALSE <NA> 0 13
12 TRUE Not possible/not believable TRUE Not possible/not believable TRUE OK 0 1
13 TRUE OK FALSE Not possible/not believable FALSE <NA> 0 3
14 TRUE OK FALSE OK FALSE Change to: [add value on column F] 0 1
15 TRUE OK FALSE OK FALSE OK 0 1
16 TRUE OK FALSE OK FALSE <NA> 0 6
17 TRUE OK FALSE Very unlikely to be true FALSE <NA> 0 7
18 TRUE OK FALSE <NA> FALSE Change to: [add value on column F] 0 1
19 TRUE OK FALSE <NA> FALSE OK 0 1
20 TRUE OK FALSE <NA> FALSE Very unlikely to be true 0 1
21 TRUE OK FALSE <NA> FALSE <NA> 0 31
22 TRUE OK TRUE Cannot determine FALSE <NA> 0 1
23 TRUE OK TRUE OK FALSE <NA> 0 1
24 TRUE OK TRUE Very unlikely to be true FALSE Very unlikely to be true 0 1
25 TRUE Very unlikely to be true TRUE OK FALSE <NA> 0 2
# Apply reviewed financial variables, unless all three flags F4, F15, and F2 are raised

# Reviewer classifications under which the reported value is kept as-is.
keep_classifications <- c("OK", "Change to: [add value on column F]")

# TRUE where a value must be blanked: either all three financial flags are
# raised together, or a reviewer classification exists that rejects the value.
# An NA flag combination yields NA, which dplyr::case_when() treats as
# "no match", exactly as the original per-variable conditions did.
blank_financial <- function(classification, same_flag, weird_flag, invalid_flag) {
  (same_flag == 1 & weird_flag == 1 & invalid_flag == 1) |
    (!is.na(classification) & !classification %in% keep_classifications)
}

# Ordered levels for the harmonized income categories.
# NOTE(review): there is no "First decile" level; presumably "No income"
# occupies the bottom of the scale -- confirm against the codebook.
income_decile_levels <- c(
  "No income", "Second decile", "Third decile", "Fourth decile",
  "Fifth decile", "Sixth decile", "Seventh decile", "Eighth decile",
  "Ninth decile", "Tenth decile"
)
income_group_levels <- c("No income", "Low", "Mid", "Upper", "Wealthiest")

df_flagged <- df_flagged |>
  dplyr::mutate(
    assets_reviewed = dplyr::case_when(
      blank_financial(classification_assets, flag_assets_debts_same,
                      flag_weird_nr, flag_fin_invalid) ~ NA_real_,
      TRUE ~ assets_reviewed
    ),
    debts_reviewed = dplyr::case_when(
      blank_financial(classification_debts, flag_assets_debts_same,
                      flag_weird_nr, flag_fin_invalid) ~ NA_real_,
      TRUE ~ debts_reviewed
    ),
    income_text_reviewed = dplyr::case_when(
      blank_financial(classification_income, flag_assets_debts_same,
                      flag_weird_nr, flag_fin_invalid) ~ NA_real_,
      TRUE ~ income_text_reviewed
    ),
    income_merg = dplyr::case_when(
      blank_financial(classification_income, flag_assets_debts_same,
                      flag_weird_nr, flag_fin_invalid) ~ NA_real_,
      TRUE ~ income_merg
    ),
    income_merg_translated = dplyr::case_when(
      blank_financial(classification_income, flag_assets_debts_same,
                      flag_weird_nr, flag_fin_invalid) ~ NA_character_,
      TRUE ~ income_merg_translated
    ),
    income_merg_group = dplyr::case_when(
      blank_financial(classification_income, flag_assets_debts_same,
                      flag_weird_nr, flag_fin_invalid) ~ NA_character_,
      TRUE ~ income_merg_group
    ),
    income_merg_cat = dplyr::case_when(
      blank_financial(classification_income, flag_assets_debts_same,
                      flag_weird_nr, flag_fin_invalid) ~ NA_character_,
      TRUE ~ income_merg_cat
    ),
    # Convert the cleaned categories to ordered factors.
    income_merg_cat = base::factor(
      income_merg_cat,
      levels = income_decile_levels,
      ordered = TRUE
    ),
    income_merg_group = base::factor(
      income_merg_group,
      levels = income_group_levels,
      ordered = TRUE
    )
  )
# Sanity check:
# Each of the three filters below should select 0 rows; the "[1] 0" fused
# onto the nrow() calls is console output captured in this listing.
df_flagged |>
dplyr::filter(is.na(income_text_reviewed) & !is.na(income_merg) & income_orig == 10) |>
base::nrow()[1] 0
# NOTE(review): this check uses "Change to: [remove value]" while the recode
# step above keeps "Change to: [add value on column F]" -- confirm which
# label(s) the codebook actually uses.
df_flagged |>
dplyr::filter(
is.na(income_text_reviewed) &
income_orig ==10 &
classification_income %in% c("OK", "Change to: [remove value]")) |>
base::nrow()[1] 0
df_flagged |>
dplyr::filter(
is.na(income_text_reviewed) &
income_orig == 10 &
is.na(classification_income)) |>
base::nrow()[1] 0
Exclusion Summary
# Combine direct exclusions and flags into one exclusion_criteria factor.
# Level order drives the column order of the country summary below.
exclusion_levels <- c(
  "valid", "incomplete", "too fast", "not residents", "implausible", "flagged"
)
implausible_statuses <- c(
  "implausible working with no income",
  "implausible retired young",
  "implausible extremes",
  "implausible high scores with speed"
)
too_fast_statuses <- c("too fast general", "too fast low var")

df_exclusion <- df_merged |>
  dplyr::left_join(
    df_flagged |>
      dplyr::select(ResponseId, exclusion_flags),
    by = "ResponseId"
  ) |>
  dplyr::mutate(
    exclusion_criteria = base::factor(
      dplyr::case_when(
        # Direct exclusions keep their original status label
        valid_status %in% c("incomplete", "not residents") ~ valid_status,
        valid_status %in% implausible_statuses ~ "implausible",
        valid_status %in% too_fast_statuses ~ "too fast",
        # Participants who passed the direct checks are split by review flags
        valid_status == "passed" &
          !is.na(exclusion_flags) & exclusion_flags == 1 ~ "flagged",
        valid_status == "passed" &
          (is.na(exclusion_flags) | exclusion_flags == 0) ~ "valid",
        TRUE ~ NA_character_
      ),
      levels = exclusion_levels
    )
  )
# Country-level summary of exclusions.
# %in% never returns NA, so this equals sum(x == status, na.rm = TRUE).
count_status <- function(criteria, status) base::sum(criteria %in% status)

summary_table <- df_exclusion |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    initial_number_of_participants = dplyr::n(),
    valid_participants = count_status(exclusion_criteria, "valid"),
    incomplete = count_status(exclusion_criteria, "incomplete"),
    too_fast = count_status(exclusion_criteria, "too fast"),
    not_residents = count_status(exclusion_criteria, "not residents"),
    implausible = count_status(exclusion_criteria, "implausible"),
    flagged = count_status(exclusion_criteria, "flagged")
  ) |>
  dplyr::mutate(
    # Everything that is not valid counts as an exclusion
    total_exclusions = initial_number_of_participants - valid_participants,
    total_pct_lost = 100 * total_exclusions / initial_number_of_participants
  )
# Total row: column-wise sums across countries, with the overall percentage
# lost recomputed from the summed counts (not summed percentages).
total_row <- summary_table |>
  dplyr::summarise(dplyr::across(-country, base::sum)) |>
  dplyr::mutate(
    country = "Total",
    total_pct_lost = 100 * total_exclusions / initial_number_of_participants
  ) |>
  dplyr::relocate(country)
# Format an exclusion-category column as "count (pct%)" relative to the
# initial sample size. format() over the whole column pads to a common width
# (hence leading spaces such as " 0.18%").
# NOTE: the original applied nsmall = 1 to total_exclusions only; nsmall = 2
# is used uniformly here so all percentages display two decimals, matching
# the published table.
fmt_count_pct <- function(count, total, digits = 2) {
  pct <- format(round(100 * count / total, digits), nsmall = digits)
  paste0(count, " (", pct, "%)")
}

summary_table_pct <- dplyr::bind_rows(summary_table, total_row) |>
  dplyr::mutate(
    dplyr::across(
      c(incomplete, too_fast, not_residents, implausible, flagged,
        total_exclusions),
      \(x) fmt_count_pct(x, initial_number_of_participants)
    )
  ) |>
  # total_pct_lost is redundant once total_exclusions carries the percentage
  dplyr::select(-total_pct_lost)
# gt
# Human-readable column labels for the exported table, keyed by the
# internal column names.
pretty_labels <- c(
  country = "Country",
  initial_number_of_participants = "Initial number of participants",
  valid_participants = "Valid participants",
  incomplete = "Incomplete",
  too_fast = "Too fast",
  not_residents = "Not residents",
  implausible = "Implausible combinations",
  flagged = "Flagged",
  total_exclusions = "Total exclusions"
)

summary_table_pdf <- summary_table_pct |>
  dplyr::rename_with(\(nms) base::unname(pretty_labels[nms]))
# Build the formatted exclusion-summary table for export (HTML/DOCX/PDF).
gt_table <- summary_table_pdf |>
gt::gt() |>
# Fixed pixel widths so the table fits one printed page
gt::cols_width(
Country ~ gt::px(65),
`Initial number of participants` ~ gt::px(70),
`Valid participants` ~ gt::px(70),
Incomplete ~ gt::px(70),
`Too fast` ~ gt::px(70),
`Not residents` ~ gt::px(70),
`Implausible combinations` ~ gt::px(70),
`Flagged` ~ gt::px(70),
`Total exclusions` ~ gt::px(105)
) |>
# White borders everywhere except horizontal body lines and the rule
# under the column labels
gt::tab_options(
table.font.size = 10,
column_labels.font.size = 11,
table.background.color = "white",
table.align = "center",
table.width = gt::px(650),
table.border.top.color = "white",
table.border.bottom.color = "white",
table.border.left.color = "white",
table.border.right.color = "white",
table_body.hlines.color = "black",
table_body.vlines.color = "white",
column_labels.vlines.color = "white",
column_labels.border.top.color = "white",
column_labels.border.bottom.color = "black"
) |>
gt::opt_table_lines() |>
# Bold, centered column labels
gt::tab_style(
style = list(
gt::cell_text(
weight = "bold",
align = "center"
)
),
locations = gt::cells_column_labels(gt::everything())
) |>
# Left-align country names, center all numeric cells
gt::tab_style(
style = gt::cell_text(
align = "left"
),
locations = gt::cells_body(columns = Country)
) |>
gt::tab_style(
style = gt::cell_text(
align = "center"
),
locations = gt::cells_body(
columns = c(
`Initial number of participants`,
`Valid participants`,
Incomplete,
`Too fast`,
`Not residents`,
`Implausible combinations`,
`Flagged`,
`Total exclusions`
)
)
); gt_table| Country | Initial number of participants | Valid participants | Incomplete | Too fast | Not residents | Implausible combinations | Flagged | Total exclusions |
|---|---|---|---|---|---|---|---|---|
| Albania | 2284 | 1758 | 487 (21.32%) | 4 ( 0.18%) | 34 (1.49%) | 1 (0.04%) | 0 (0.00%) | 526 (23.03%) |
| Algeria | 203 | 149 | 53 (26.11%) | 1 ( 0.49%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 54 (26.60%) |
| Angola | 329 | 240 | 68 (20.67%) | 15 ( 4.56%) | 1 (0.30%) | 5 (1.52%) | 0 (0.00%) | 89 (27.05%) |
| Argentina | 769 | 634 | 119 (15.47%) | 3 ( 0.39%) | 11 (1.43%) | 2 (0.26%) | 0 (0.00%) | 135 (17.56%) |
| Armenia | 334 | 246 | 83 (24.85%) | 1 ( 0.30%) | 1 (0.30%) | 3 (0.90%) | 0 (0.00%) | 88 (26.35%) |
| Australia | 605 | 500 | 67 (11.07%) | 25 ( 4.13%) | 7 (1.16%) | 5 (0.83%) | 1 (0.17%) | 105 (17.36%) |
| Austria | 685 | 570 | 106 (15.47%) | 8 ( 1.17%) | 1 (0.15%) | 0 (0.00%) | 0 (0.00%) | 115 (16.79%) |
| Bahrain | 211 | 161 | 48 (22.75%) | 1 ( 0.47%) | 1 (0.47%) | 0 (0.00%) | 0 (0.00%) | 50 (23.70%) |
| Bangladesh | 536 | 335 | 186 (34.70%) | 0 ( 0.00%) | 0 (0.00%) | 14 (2.61%) | 1 (0.19%) | 201 (37.50%) |
| Belgium | 331 | 272 | 42 (12.69%) | 14 ( 4.23%) | 2 (0.60%) | 1 (0.30%) | 0 (0.00%) | 59 (17.82%) |
| Bolivia | 341 | 279 | 59 (17.30%) | 0 ( 0.00%) | 2 (0.59%) | 1 (0.29%) | 0 (0.00%) | 62 (18.18%) |
| Bosnia-Herzegovina | 642 | 486 | 144 (22.43%) | 11 ( 1.71%) | 1 (0.16%) | 0 (0.00%) | 0 (0.00%) | 156 (24.30%) |
| Brazil | 2094 | 1809 | 241 (11.51%) | 30 ( 1.43%) | 1 (0.05%) | 13 (0.62%) | 0 (0.00%) | 285 (13.61%) |
| Bulgaria | 393 | 324 | 67 (17.05%) | 1 ( 0.25%) | 0 (0.00%) | 1 (0.25%) | 0 (0.00%) | 69 (17.56%) |
| Canada | 874 | 707 | 148 (16.93%) | 16 ( 1.83%) | 1 (0.11%) | 2 (0.23%) | 0 (0.00%) | 167 (19.11%) |
| Chad | 192 | 115 | 73 (38.02%) | 0 ( 0.00%) | 2 (1.04%) | 2 (1.04%) | 0 (0.00%) | 77 (40.10%) |
| Chile | 240 | 207 | 30 (12.50%) | 3 ( 1.25%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 33 (13.75%) |
| China | 2523 | 1018 | 215 ( 8.52%) | 1277 (50.61%) | 0 (0.00%) | 13 (0.52%) | 0 (0.00%) | 1505 (59.65%) |
| Croatia | 455 | 349 | 99 (21.76%) | 7 ( 1.54%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 106 (23.30%) |
| Cyprus | 218 | 173 | 41 (18.81%) | 0 ( 0.00%) | 0 (0.00%) | 4 (1.83%) | 0 (0.00%) | 45 (20.64%) |
| Czech Republic | 267 | 202 | 63 (23.60%) | 1 ( 0.37%) | 1 (0.37%) | 0 (0.00%) | 0 (0.00%) | 65 (24.34%) |
| Denmark | 338 | 283 | 47 (13.91%) | 7 ( 2.07%) | 0 (0.00%) | 1 (0.30%) | 0 (0.00%) | 55 (16.27%) |
| Ecuador | 1075 | 954 | 94 ( 8.74%) | 7 ( 0.65%) | 11 (1.02%) | 9 (0.84%) | 0 (0.00%) | 121 (11.26%) |
| Egypt | 869 | 630 | 221 (25.43%) | 5 ( 0.58%) | 9 (1.04%) | 4 (0.46%) | 0 (0.00%) | 239 (27.50%) |
| Estonia | 2402 | 1903 | 480 (19.98%) | 4 ( 0.17%) | 3 (0.12%) | 12 (0.50%) | 0 (0.00%) | 499 (20.77%) |
| Ethiopia | 552 | 403 | 141 (25.54%) | 1 ( 0.18%) | 4 (0.72%) | 3 (0.54%) | 0 (0.00%) | 149 (26.99%) |
| Finland | 275 | 241 | 24 ( 8.73%) | 7 ( 2.55%) | 1 (0.36%) | 2 (0.73%) | 0 (0.00%) | 34 (12.36%) |
| France | 1175 | 908 | 153 (13.02%) | 90 ( 7.66%) | 2 (0.17%) | 21 (1.79%) | 1 (0.09%) | 267 (22.72%) |
| Georgia | 504 | 371 | 126 (25.00%) | 6 ( 1.19%) | 1 (0.20%) | 0 (0.00%) | 0 (0.00%) | 133 (26.39%) |
| Germany | 1008 | 824 | 156 (15.48%) | 21 ( 2.08%) | 1 (0.10%) | 5 (0.50%) | 1 (0.10%) | 184 (18.25%) |
| Greece | 532 | 444 | 81 (15.23%) | 3 ( 0.56%) | 2 (0.38%) | 2 (0.38%) | 0 (0.00%) | 88 (16.54%) |
| Hong Kong | 237 | 176 | 45 (18.99%) | 6 ( 2.53%) | 10 (4.22%) | 0 (0.00%) | 0 (0.00%) | 61 (25.74%) |
| Hungary | 735 | 555 | 169 (22.99%) | 5 ( 0.68%) | 3 (0.41%) | 3 (0.41%) | 0 (0.00%) | 180 (24.49%) |
| India | 1627 | 1225 | 315 (19.36%) | 26 ( 1.60%) | 21 (1.29%) | 34 (2.09%) | 6 (0.37%) | 402 (24.71%) |
| Indonesia | 1501 | 1223 | 250 (16.66%) | 13 ( 0.87%) | 3 (0.20%) | 8 (0.53%) | 4 (0.27%) | 278 (18.52%) |
| Iran | 292 | 216 | 75 (25.68%) | 0 ( 0.00%) | 0 (0.00%) | 1 (0.34%) | 0 (0.00%) | 76 (26.03%) |
| Ireland | 1661 | 1526 | 110 ( 6.62%) | 10 ( 0.60%) | 12 (0.72%) | 3 (0.18%) | 0 (0.00%) | 135 ( 8.13%) |
| Israel | 437 | 353 | 75 (17.16%) | 8 ( 1.83%) | 0 (0.00%) | 1 (0.23%) | 0 (0.00%) | 84 (19.22%) |
| Italy | 566 | 489 | 66 (11.66%) | 7 ( 1.24%) | 1 (0.18%) | 3 (0.53%) | 0 (0.00%) | 77 (13.60%) |
| Japan | 549 | 431 | 76 (13.84%) | 36 ( 6.56%) | 0 (0.00%) | 5 (0.91%) | 1 (0.18%) | 118 (21.49%) |
| Kazakhstan | 787 | 676 | 91 (11.56%) | 11 ( 1.40%) | 8 (1.02%) | 1 (0.13%) | 0 (0.00%) | 111 (14.10%) |
| Kosovo | 1373 | 994 | 359 (26.15%) | 4 ( 0.29%) | 12 (0.87%) | 4 (0.29%) | 0 (0.00%) | 379 (27.60%) |
| Kuwait | 315 | 241 | 69 (21.90%) | 2 ( 0.63%) | 2 (0.63%) | 1 (0.32%) | 0 (0.00%) | 74 (23.49%) |
| Kyrgyzstan | 375 | 274 | 74 (19.73%) | 22 ( 5.87%) | 1 (0.27%) | 4 (1.07%) | 0 (0.00%) | 101 (26.93%) |
| Latvia | 1023 | 806 | 206 (20.14%) | 6 ( 0.59%) | 2 (0.20%) | 3 (0.29%) | 0 (0.00%) | 217 (21.21%) |
| Lebanon | 416 | 322 | 84 (20.19%) | 0 ( 0.00%) | 3 (0.72%) | 7 (1.68%) | 0 (0.00%) | 94 (22.60%) |
| Madagascar | 169 | 145 | 22 (13.02%) | 0 ( 0.00%) | 0 (0.00%) | 2 (1.18%) | 0 (0.00%) | 24 (14.20%) |
| Malaysia | 816 | 706 | 99 (12.13%) | 2 ( 0.25%) | 1 (0.12%) | 7 (0.86%) | 1 (0.12%) | 110 (13.48%) |
| Mexico | 1164 | 1062 | 84 ( 7.22%) | 10 ( 0.86%) | 1 (0.09%) | 6 (0.52%) | 1 (0.09%) | 102 ( 8.76%) |
| Moldova | 511 | 398 | 100 (19.57%) | 3 ( 0.59%) | 4 (0.78%) | 5 (0.98%) | 1 (0.20%) | 113 (22.11%) |
| Mongolia | 367 | 261 | 100 (27.25%) | 0 ( 0.00%) | 6 (1.63%) | 0 (0.00%) | 0 (0.00%) | 106 (28.88%) |
| Montenegro | 358 | 301 | 45 (12.57%) | 4 ( 1.12%) | 7 (1.96%) | 1 (0.28%) | 0 (0.00%) | 57 (15.92%) |
| Morocco | 302 | 231 | 61 (20.20%) | 3 ( 0.99%) | 1 (0.33%) | 6 (1.99%) | 0 (0.00%) | 71 (23.51%) |
| Mozambique | 154 | 122 | 31 (20.13%) | 0 ( 0.00%) | 0 (0.00%) | 1 (0.65%) | 0 (0.00%) | 32 (20.78%) |
| Netherlands | 448 | 353 | 79 (17.63%) | 14 ( 3.12%) | 1 (0.22%) | 1 (0.22%) | 0 (0.00%) | 95 (21.21%) |
| Nigeria | 721 | 636 | 75 (10.40%) | 0 ( 0.00%) | 8 (1.11%) | 2 (0.28%) | 0 (0.00%) | 85 (11.79%) |
| North Macedonia | 268 | 230 | 37 (13.81%) | 0 ( 0.00%) | 1 (0.37%) | 0 (0.00%) | 0 (0.00%) | 38 (14.18%) |
| Norway | 509 | 408 | 90 (17.68%) | 11 ( 2.16%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 101 (19.84%) |
| Oman | 520 | 413 | 100 (19.23%) | 1 ( 0.19%) | 3 (0.58%) | 2 (0.38%) | 1 (0.19%) | 107 (20.58%) |
| Pakistan | 507 | 401 | 94 (18.54%) | 2 ( 0.39%) | 1 (0.20%) | 9 (1.78%) | 0 (0.00%) | 106 (20.91%) |
| Paraguay | 205 | 162 | 42 (20.49%) | 1 ( 0.49%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 43 (20.98%) |
| Peru | 1031 | 868 | 87 ( 8.44%) | 48 ( 4.66%) | 4 (0.39%) | 23 (2.23%) | 1 (0.10%) | 163 (15.81%) |
| Philippines | 3556 | 2636 | 769 (21.63%) | 17 ( 0.48%) | 120 (3.37%) | 12 (0.34%) | 2 (0.06%) | 920 (25.87%) |
| Poland | 1288 | 1024 | 250 (19.41%) | 8 ( 0.62%) | 3 (0.23%) | 3 (0.23%) | 0 (0.00%) | 264 (20.50%) |
| Portugal | 579 | 451 | 118 (20.38%) | 7 ( 1.21%) | 0 (0.00%) | 3 (0.52%) | 0 (0.00%) | 128 (22.11%) |
| Qatar | 526 | 397 | 113 (21.48%) | 5 ( 0.95%) | 11 (2.09%) | 0 (0.00%) | 0 (0.00%) | 129 (24.52%) |
| Republic of Korea | 492 | 425 | 27 ( 5.49%) | 34 ( 6.91%) | 1 (0.20%) | 5 (1.02%) | 0 (0.00%) | 67 (13.62%) |
| Romania | 861 | 676 | 174 (20.21%) | 10 ( 1.16%) | 0 (0.00%) | 1 (0.12%) | 0 (0.00%) | 185 (21.49%) |
| Russia | 1322 | 1168 | 73 ( 5.52%) | 51 ( 3.86%) | 17 (1.29%) | 12 (0.91%) | 1 (0.08%) | 154 (11.65%) |
| Saudi Arabia | 296 | 260 | 27 ( 9.12%) | 6 ( 2.03%) | 1 (0.34%) | 2 (0.68%) | 0 (0.00%) | 36 (12.16%) |
| Senegal | 211 | 142 | 66 (31.28%) | 0 ( 0.00%) | 1 (0.47%) | 2 (0.95%) | 0 (0.00%) | 69 (32.70%) |
| Serbia | 420 | 324 | 89 (21.19%) | 4 ( 0.95%) | 3 (0.71%) | 0 (0.00%) | 0 (0.00%) | 96 (22.86%) |
| Singapore | 298 | 239 | 19 ( 6.38%) | 39 (13.09%) | 0 (0.00%) | 1 (0.34%) | 0 (0.00%) | 59 (19.80%) |
| Slovakia | 724 | 517 | 196 (27.07%) | 3 ( 0.41%) | 5 (0.69%) | 3 (0.41%) | 0 (0.00%) | 207 (28.59%) |
| Slovenia | 746 | 584 | 154 (20.64%) | 5 ( 0.67%) | 1 (0.13%) | 2 (0.27%) | 0 (0.00%) | 162 (21.72%) |
| South Africa | 279 | 233 | 45 (16.13%) | 0 ( 0.00%) | 0 (0.00%) | 1 (0.36%) | 0 (0.00%) | 46 (16.49%) |
| Spain | 729 | 614 | 104 (14.27%) | 6 ( 0.82%) | 3 (0.41%) | 2 (0.27%) | 0 (0.00%) | 115 (15.78%) |
| Sweden | 1149 | 824 | 266 (23.15%) | 53 ( 4.61%) | 0 (0.00%) | 6 (0.52%) | 0 (0.00%) | 325 (28.29%) |
| Switzerland | 823 | 668 | 139 (16.89%) | 14 ( 1.70%) | 0 (0.00%) | 2 (0.24%) | 0 (0.00%) | 155 (18.83%) |
| Taiwan | 201 | 146 | 36 (17.91%) | 17 ( 8.46%) | 0 (0.00%) | 2 (1.00%) | 0 (0.00%) | 55 (27.36%) |
| Thailand | 440 | 375 | 59 (13.41%) | 4 ( 0.91%) | 0 (0.00%) | 2 (0.45%) | 0 (0.00%) | 65 (14.77%) |
| Timor-Leste | 277 | 144 | 129 (46.57%) | 0 ( 0.00%) | 1 (0.36%) | 3 (1.08%) | 0 (0.00%) | 133 (48.01%) |
| Türkiye | 682 | 487 | 171 (25.07%) | 11 ( 1.61%) | 12 (1.76%) | 1 (0.15%) | 0 (0.00%) | 195 (28.59%) |
| UAE | 336 | 228 | 86 (25.60%) | 6 ( 1.79%) | 13 (3.87%) | 3 (0.89%) | 0 (0.00%) | 108 (32.14%) |
| UK | 852 | 671 | 147 (17.25%) | 26 ( 3.05%) | 5 (0.59%) | 3 (0.35%) | 0 (0.00%) | 181 (21.24%) |
| USA | 5708 | 4242 | 1002 (17.55%) | 170 ( 2.98%) | 280 (4.91%) | 11 (0.19%) | 3 (0.05%) | 1466 (25.68%) |
| Uganda | 332 | 242 | 85 (25.60%) | 0 ( 0.00%) | 1 (0.30%) | 4 (1.20%) | 0 (0.00%) | 90 (27.11%) |
| Ukraine | 749 | 654 | 88 (11.75%) | 3 ( 0.40%) | 3 (0.40%) | 1 (0.13%) | 0 (0.00%) | 95 (12.68%) |
| Uruguay | 815 | 566 | 246 (30.18%) | 1 ( 0.12%) | 2 (0.25%) | 0 (0.00%) | 0 (0.00%) | 249 (30.55%) |
| Uzbekistan | 662 | 556 | 102 (15.41%) | 4 ( 0.60%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 106 (16.01%) |
| Yemen | 580 | 397 | 170 (29.31%) | 1 ( 0.17%) | 5 (0.86%) | 7 (1.21%) | 0 (0.00%) | 183 (31.55%) |
| Zimbabwe | 275 | 210 | 59 (21.45%) | 3 ( 1.09%) | 1 (0.36%) | 2 (0.73%) | 0 (0.00%) | 65 (23.64%) |
| Total | 69408 | 53799 | 12181 (17.55%) | 2332 ( 3.36%) | 705 (1.02%) | 365 (0.53%) | 26 (0.04%) | 15609 (22.49%) |
# Export the exclusion table: HTML and DOCX via gt, then render the HTML
# to PDF with headless Chrome.
for (exclusion_table_file in c("222_exclusion_table.html",
                               "222_exclusion_table.docx")) {
  gt::gtsave(gt_table, exclusion_table_file)
}
pagedown::chrome_print(
  "222_exclusion_table.html",
  output = "222_exclusion_table.pdf"
)
# Country-level inclusion and sample sizes.
# NOTE(review): inclusion_rate is stored as a formatted character string;
# downstream slice_min/slice_max on it therefore sort lexicographically,
# which matches numeric order only while all rates have two integer digits.
country_inclusion <- df_exclusion |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    initial_number_of_participants = dplyr::n(),
    valid_participants = base::sum(exclusion_criteria %in% "valid")
  ) |>
  dplyr::mutate(
    inclusion_rate = format(
      round(100 * valid_participants / initial_number_of_participants, 2),
      nsmall = 2
    )
  )
# Country with minimum and maximum inclusion rates
# NOTE(review): inclusion_rate is a formatted character column, so slice_min()
# compares strings lexicographically; this matches numeric order here only
# because all observed rates have two integer digits.
# NOTE(review): the maximum-rate tibble printed below has no corresponding
# slice_max() call in this listing -- presumably lost in extraction.
country_inclusion |>
dplyr::slice_min(inclusion_rate, n = 1, with_ties = FALSE)# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 China 2523 1018 40.35
# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 Ireland 1661 1526 91.87
# Countries with smallest and largest valid sample sizes
# NOTE(review): only the slice_min() call is visible; the largest-sample
# tibble printed below implies a slice_max() call not shown in this listing.
country_inclusion |>
dplyr::slice_min(valid_participants, n = 1, with_ties = FALSE)# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 Chad 192 115 59.90
# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 USA 5708 4242 74.32
Missing Data
# Focus on the original variables
# Restrict the missingness audit to columns also present in the public data.
orig_cols <- base::intersect(names(df_final), names(df_pub))
df_final_orig <- df_final |>
dplyr::select(all_of(orig_cols))
# Visualize the missingness pattern, then list percent missing per column.
# (Two statements are fused on the next line by the listing extraction.)
visdat::vis_miss(df_final_orig, cluster = TRUE, warn_large_data = FALSE)df_final_orig |>
dplyr::summarise(
dplyr::across(
dplyr::everything(),
\(x) 100 * mean(is.na(x), na.rm = TRUE)
),
.groups = "drop"
) |>
tidyr::pivot_longer(
cols = dplyr::everything(),
names_to = "col",
values_to = "pct_missing"
) |>
print_reactable(sorted_col = "pct_missing", width = 600)A0.3. Harmonize financial variables
Midpoint of the brackets
The midpoints of the brackets were computed as (low_bracket + high_bracket)/2, except for the last bracket, which needs to be computed differently because it is open-ended. We computed the median ratio between the midpoints of the last bracket and the low point of the last bracket across countries where the low point of the last bracket was lower than the maximum open text answer provided by participants. This median ratio was then used to compute the midpoint of the last bracket in countries where the low point of the last bracket was higher than the maximum open text answer provided by participants.
# Calculate midpoints for all brackets except the last one
# Bracket 9 (the open-ended top bracket) is left NA here and filled in below.
income_brackets <- income_recoded |>
dplyr::mutate(
income_midpoint = dplyr::case_when(
income_orig == 9 ~ NA_real_,
TRUE ~ (income_lowpoint + income_highpoint) / 2
)
) |>
dplyr::select(UserLanguage, income_orig, income_midpoint) |>
dplyr::glimpse(width = 100)Rows: 1,125
Columns: 3
$ UserLanguage <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB…
$ income_orig <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7,…
$ income_midpoint <dbl> 60000.0, 160000.0, 250000.0, 355000.0, 515000.0, 725000.0, 1040000.0, 1625…
[1] 53799
# Attach bracket midpoints; the fused "[1] 53799" nrow() output confirms
# the join did not duplicate rows.
df_final <- df_final |>
dplyr::left_join(income_brackets, by = c("UserLanguage", "income_orig"))
nrow(df_final)[1] 53799
# Calculate midpoints for the last bracket
# Per country: midpoint = mean of the bracket lowpoint and the largest
# open-text income answer; also flag countries where the lowpoint already
# exceeds every open-text answer.
# NOTE(review): base::max(..., na.rm = TRUE) returns -Inf (with a warning)
# for a country with no non-missing values -- confirm this cannot occur here.
midpoints_last <- df_final |>
# There are no income values from the open text field in Taiwan
dplyr::filter(country != "Taiwan") |>
dplyr::group_by(country) |>
dplyr::summarise(
# Ireland has different lowpoints for the last bracket
# in the main dataset and the sponsored dataset.
max_income_lowpoint = base::max(income_lowpoint_9, na.rm = TRUE),
max_income_text = base::max(income_text_reviewed, na.rm = TRUE),
income_midpoint_last =
base::mean(c(max_income_lowpoint, max_income_text), na.rm = TRUE),
bracket_higher_than_text =
!is.na(max_income_text) & max_income_lowpoint > max_income_text,
ratio = income_midpoint_last/max_income_lowpoint
) |>
dplyr::glimpse(width = 100)Rows: 91
Columns: 6
$ country <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "Australi…
$ max_income_lowpoint <dbl> 2000000, 200000, 1450000001, 2400000, 1200001, 250000, 93163, 230…
$ max_income_text <dbl> 2000000, 400000, 200000000, 40000000, 4000000, 290000, 200000, 55…
$ income_midpoint_last <dbl> 2000000.0, 300000.0, 825000000.5, 21200000.0, 2600000.5, 270000.0…
$ bracket_higher_than_text <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ ratio <dbl> 1.0000000, 1.5000000, 0.5689655, 8.8333333, 2.1666653, 1.0800000,…
# Median midpoint/lowpoint ratio across countries where the bracket lowpoint
# did NOT exceed the max open-text answer; used below to impute last-bracket
# midpoints where it did (the fused "[1] 2" is console output).
ratio_last_bracket <- midpoints_last |>
dplyr::filter(!bracket_higher_than_text) |>
dplyr::summarise(median_ratio = stats::median(ratio, na.rm = TRUE)) |>
dplyr::pull(median_ratio); ratio_last_bracket[1] 2
[1] 53799
df_final <- df_final |>
dplyr::left_join(midpoints_last |> dplyr::select(country, bracket_higher_than_text, max_income_lowpoint, income_midpoint_last), by = "country")
nrow(df_final)[1] 53799
# Update income_midpoint for the last bracket
# The first two branches are mutually exclusive on the bracket flag; Taiwan
# has no open-text answers (flag is NA) and so falls through to its own rule.
df_final <-
  df_final |>
  dplyr::mutate(
    income_midpoint = dplyr::case_when(
      # Lowpoint above every open-text answer: scale the lowpoint by the
      # cross-country median ratio
      income_orig == 9 & bracket_higher_than_text ~
        max_income_lowpoint * ratio_last_bracket,
      # Otherwise keep the midpoint derived from observed open-text answers
      income_orig == 9 & !bracket_higher_than_text ~ income_midpoint_last,
      # Taiwan: scale its own bracket lowpoint instead
      income_orig == 9 & country == "Taiwan"
        ~ income_lowpoint_9 * ratio_last_bracket,
      TRUE ~ income_midpoint
    )
  )
# Sanity check: View last midpoints
# Grouping on all displayed columns plus empty summarise() collapses to the
# distinct country-level midpoint rows.
df_final |>
dplyr::filter(income_orig == 9) |>
dplyr::group_by(country, income_orig, income_lowpoint_9, income_midpoint) |>
dplyr::summarise() |>
dplyr::arrange(country) |>
print_reactable(sorted_col = "income_midpoint", width = 800)Convert monthly values to annual
We created a variable that combined midpoints from brackets with the open field answers and converted monthly values to annual values.
The survey versions for Bahrain and Pakistan requested annual income in their native-language versions, but monthly income in the English version. If the distribution of responses is similar, we will retain the transformed monthly values.
annual monthly <NA>
28802 24997 0
annual monthly <NA>
12 0 21935 0
12.5 0 241 0
13 0 2821 0
<NA> 28802 0 0
# Combine bracket midpoints and open-text answers into continuous income,
# then annualize monthly reports.
df_final <- df_final |>
  dplyr::mutate(
    # Continuous income: 0 for "no income", open-text value for bracket 10,
    # otherwise the bracket midpoint
    income_cont = dplyr::case_when(
      income_orig == 0 ~ 0,
      income_orig == 10 ~ income_text_reviewed,
      !is.na(income_midpoint) ~ income_midpoint,
      TRUE ~ NA_real_
    ),
    # Same, but without the explicit zero and keeping only positive
    # open-text answers
    income_cont_nozero = dplyr::case_when(
      income_orig == 10 & income_text_reviewed > 0 ~ income_text_reviewed,
      !is.na(income_midpoint) ~ income_midpoint,
      TRUE ~ NA_real_
    ),
    # Annualize monthly reports with the country-specific wages_per_year;
    # %in% maps a missing income_period to FALSE, so those rows keep the
    # unconverted value, as in the original case_when fall-through.
    income_annual = base::ifelse(
      income_period %in% "monthly",
      income_cont_nozero * wages_per_year,
      income_cont_nozero
    )
  )
# Sanity check: inspect the income harmonization per country and bracket.
df_final |> dplyr::group_by(country, income_orig, income_merg, income_text_reviewed,
income_merg_translated, income_cont, income_midpoint,
income_cont_nozero, income_annual, income_period) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(country, income_orig) |>
print_reactable(sorted_col = "country", width = 900)Convert 2025 income values to 2024
The inflation2024_factor was calculated with Consumer Price Index (CPI): CPI 2025 / CPI 2024
If national reports only provided the percentage of inflation change, then the factor was calculated as 1 + (percentage inflation change / 100). Then, to convert 2025 income to 2024 values we divided 2025 income by this factor.
2024 2025 <NA>
50097 3702 0
0.966 1.001 1.0021 1.0182 1.0399 1.0415 1.045 1.046 1.073 1.087 1.118 <NA>
115 293 241 500 371 122 230 324 274 556 676 50097
Income net and gross
Collaborators from Kuwait, Oman, and Saudi Arabia confirmed that the income values do not require transformation because there is no income tax in these countries.
The survey version for Belgium requested net income in the Dutch version, and gross income in the French version.
Since the calculation of social contribution and tax deduction is not the same for all countries, we will remove the countries that asked for net income.
We will not apply tax brackets for Zimbabwe, since the system changes throughout the year due to inflation.
Converting all financial values to USD
# Convert income, assets, and debts to USD with the per-country
# local-currency conversion rate.
df_final <- df_final |>
dplyr::mutate(
# We have already set values of 0 as NA earlier
income_annual_24_gross_USD = income_annual_24_gross *
one_local_unit_to_USD_conversion,
# If assets or debts are 0, set to NA
assets_USD = base::ifelse(
is.na(assets_reviewed) | assets_reviewed == 0,
NA_real_,
assets_reviewed * one_local_unit_to_USD_conversion
),
debts_USD = base::ifelse(
is.na(debts_reviewed) | debts_reviewed == 0,
NA_real_,
debts_reviewed * one_local_unit_to_USD_conversion
)
)Calculating z-scores and percentiles for assets and debts
# Sanity check: View means and sds
# Per country x income-type descriptives for the three USD variables.
df_final |>
dplyr::group_by(country, income_type) |>
dplyr::summarise(
n_income = base::round(base::sum(!is.na(income_annual_24_gross_USD)), 2),
mean_income = base::round(base::mean(income_annual_24_gross_USD, na.rm = TRUE), 2),
sd_income = base::round(stats::sd(income_annual_24_gross_USD, na.rm = TRUE), 2),
n_assets = base::round(base::sum(!is.na(assets_USD)), 2),
mean_assets = base::round(base::mean(assets_USD, na.rm = TRUE), 2),
sd_assets = base::round(stats::sd(assets_USD, na.rm = TRUE), 2),
n_debts = base::round(base::sum(!is.na(debts_USD)), 2),
mean_debts = base::round(base::mean(debts_USD, na.rm = TRUE), 2),
sd_debts = base::round(stats::sd(debts_USD, na.rm = TRUE), 2)
) |>
# Two statements are fused on the next line by the listing extraction:
# the reactable call ends one, and the z-score pipeline begins the other.
# Within-country ("_local") standardization and percentiles first, then the
# pooled full-sample ("_full") versions after ungrouping.
# NOTE(review): the ifelse(!is.na(x), ...) wrappers look redundant --
# both (x - mean)/sd and percent_rank() already propagate NA -- confirm
# before simplifying.
print_reactable(sorted_col = "country", width = 900)df_final <- df_final |>
dplyr::group_by(country) |>
dplyr::mutate(
income_USD_z_local = base::ifelse(
!is.na(income_annual_24_gross_USD),
(income_annual_24_gross_USD -
base::mean(income_annual_24_gross_USD, na.rm = TRUE))
/ stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
NA_real_
),
assets_USD_z_local = base::ifelse(
!is.na(assets_USD),
(assets_USD - base::mean(assets_USD, na.rm = TRUE))
/ stats::sd(assets_USD, na.rm = TRUE),
NA_real_
),
debts_USD_z_local = base::ifelse(
!is.na(debts_USD),
(debts_USD - base::mean(debts_USD, na.rm = TRUE))
/ stats::sd(debts_USD, na.rm = TRUE),
NA_real_
),
income_USD_percentile_local = base::ifelse(
!is.na(income_annual_24_gross_USD),
dplyr::percent_rank(income_annual_24_gross_USD),
NA_real_
),
assets_USD_percentile_local = base::ifelse(
!is.na(assets_USD),
dplyr::percent_rank(assets_USD),
NA_real_
),
debts_USD_percentile_local = base::ifelse(
!is.na(debts_USD),
dplyr::percent_rank(debts_USD),
NA_real_
)
) |>
dplyr::ungroup() |>
dplyr::mutate(
income_USD_z_full = base::ifelse(
!is.na(income_annual_24_gross_USD),
(income_annual_24_gross_USD -
base::mean(income_annual_24_gross_USD, na.rm = TRUE))
/ stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
NA_real_
),
assets_USD_z_full = base::ifelse(
!is.na(assets_USD),
(assets_USD - base::mean(assets_USD, na.rm = TRUE))
/ stats::sd(assets_USD, na.rm = TRUE),
NA_real_
),
debts_USD_z_full = base::ifelse(
!is.na(debts_USD),
(debts_USD - base::mean(debts_USD, na.rm = TRUE))
/ stats::sd(debts_USD, na.rm = TRUE),
NA_real_
),
income_USD_percentile_full = base::ifelse(
!is.na(income_annual_24_gross_USD),
dplyr::percent_rank(income_annual_24_gross_USD),
NA_real_
),
assets_USD_percentile_full = base::ifelse(
!is.na(assets_USD),
dplyr::percent_rank(assets_USD),
NA_real_
),
debts_USD_percentile_full = base::ifelse(
!is.na(debts_USD),
dplyr::percent_rank(debts_USD),
NA_real_
)
)
# Sanity checks on counts of unusable income values; the repeated "[1] 2664"
# and "[1] 13180" lines are console output captured in this listing.
(df_gmh |> filter(is.na(income_cont)) |> nrow()) +
(df_gmh |> filter(income_cont == 0) |> nrow())[1] 2664
[1] 2664
[1] 2664
(df_gmh |> filter(is.na(income_annual_24) & income_type != "net") |> nrow()) +
(df_gmh |> filter(income_type == "net") |> nrow())[1] 13180
[1] 13180
[1] 13180
A0.4. Weights data
For Moldova, Romania, Nigeria, Montenegro, Angola, Morocco, Uruguay, Paraguay, Greece, Iran, Hungary, Kosovo, Yemen, Chile, and Uganda, values of 1 were used instead of weighted scores.
# Load weights computed based on age, education, sex, and country
weights <- base::readRDS("444_weighted_data.RDS")
# Sanity check: View participants without weights due to missing sociodemographics
weights |>
dplyr::filter(is.na(ps_weight), !is.na(education_recoded_cat)) |>
dplyr::select(ResponseId, country, age, sex_binary_cat, education_recoded_cat) |>
print_reactable(sorted_col = "country", width = 600)[1] 53799
# Attach post-stratification weights; the fused "[1] 53799" nrow() output
# confirms the join did not duplicate rows.
df_gmh <- df_final |>
dplyr::left_join( weights |> dplyr::select(ResponseId, ps_weight), by = "ResponseId")
nrow(df_gmh)[1] 53799
# For a set of countries, recode the weight score to 1. Also recode NA to 1.
# (Uses the native |> pipe and dplyr:: qualification for consistency with the
# rest of the script; the original mixed in %>% and an unqualified mutate().)
# mutate() evaluates sequentially, so ps_weight_na is built from the already
# overridden ps_weight: NA for flagged countries, the original weight elsewhere.
df_gmh <- df_gmh |>
  dplyr::mutate(
    # Force a neutral weight of 1 in countries whose weights are unreliable
    ps_weight = base::ifelse(
      country %in% flagged_countries,
      1,
      ps_weight),
    # Indicator: 1 when the country's weights were overridden
    ps_weight_flag = base::ifelse(
      country %in% flagged_countries,
      1, 0
    ),
    # Copy that preserves NA for overridden countries (missingness checks)
    ps_weight_na = base::ifelse(
      country %in% flagged_countries,
      NA_real_, ps_weight
    ),
    # Any remaining missing weights also default to 1
    ps_weight = base::ifelse(is.na(ps_weight), 1, ps_weight)
  )
# Sanity check: How many missing values in weights after transforming those to 1?
# ps_weight_na preserves NA for the flagged countries; base::nrow() just
# confirms the summary is a single row ("[1] 1" is fused console output).
df_gmh |>
dplyr::summarise(
n_missing_weights = base::sum(is.na(ps_weight_na)),
perc_missing_weights = (n_missing_weights / dplyr::n()) * 100
) |> base::nrow()[1] 1
A0.5. Calculate Factor Scores
Global Factor Scores
# Single-factor CFA for multidimensional psychological well-being (MPWB),
# weighted by the post-stratification weights, with robust (MLR) estimation
# and a standardized latent variable.
mpwb_model <-
  'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality'
fit_mpwb <- lavaan::cfa(
  model = mpwb_model,
  data = df_gmh,
  std.lv = TRUE,
  estimator = "MLR",
  sampling.weights = "ps_weight"
)
# There are no missing cases on any of the indicators, and lavPredict()
# preserves row order (per the lavaan manual), so a direct column
# assignment of the first (only) latent variable is safe.
factor_scores <- lavaan::lavPredict(fit_mpwb, type = "lv")
df_gmh$mpwb_factor_global <- factor_scores[, 1]
# View loadings
# Full fit summary: fit indices, standardized loadings, and R-squared.
# (The lavaan output is fused onto the call line by the listing extraction.)
summary(fit_mpwb, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)lavaan 0.6-19 ended normally after 15 iterations
Estimator ML
Optimization method NLMINB
Number of model parameters 20
Number of observations 53799
Sampling weights variable ps_weight
Model Test User Model:
Standard Scaled
Test Statistic 11658.139 4049.674
Degrees of freedom 35 35
P-value (Chi-square) 0.000 0.000
Scaling correction factor 2.879
Yuan-Bentler correction (Mplus variant)
Model Test Baseline Model:
Test statistic 281635.351 94343.306
Degrees of freedom 45 45
P-value 0.000 0.000
Scaling correction factor 2.985
User Model versus Baseline Model:
Comparative Fit Index (CFI) 0.959 0.957
Tucker-Lewis Index (TLI) 0.947 0.945
Robust Comparative Fit Index (CFI) 0.959
Robust Tucker-Lewis Index (TLI) 0.947
Loglikelihood and Information Criteria:
Loglikelihood user model (H0) -840221.377 -840221.377
Scaling correction factor 2.624
for the MLR correction
Loglikelihood unrestricted model (H1) -834392.308 -834392.308
Scaling correction factor 2.786
for the MLR correction
Akaike (AIC) 1680482.755 1680482.755
Bayesian (BIC) 1680660.615 1680660.615
Sample-size adjusted Bayesian (SABIC) 1680597.055 1680597.055
Root Mean Square Error of Approximation:
RMSEA 0.079 0.046
90 Percent confidence interval - lower 0.077 0.045
90 Percent confidence interval - upper 0.080 0.047
P-value H_0: RMSEA <= 0.050 0.000 1.000
P-value H_0: RMSEA >= 0.080 0.025 0.000
Robust RMSEA 0.078
90 Percent confidence interval - lower 0.076
90 Percent confidence interval - upper 0.080
P-value H_0: Robust RMSEA <= 0.050 0.000
P-value H_0: Robust RMSEA >= 0.080 0.092
Standardized Root Mean Square Residual:
SRMR 0.031 0.031
Parameter Estimates:
Standard errors Sandwich
Information bread Observed
Observed information based on Hessian
Latent Variables:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
mpwb =~
mpwb_competenc 1.064 0.009 124.883 0.000 1.064 0.737
mpwb_mtnl_stbl 1.123 0.008 139.341 0.000 1.123 0.743
mpwb_engagemnt 0.773 0.009 85.772 0.000 0.773 0.577
mpwb_meaning 1.127 0.009 130.748 0.000 1.127 0.757
mpwb_optimism 1.213 0.008 145.537 0.000 1.213 0.763
mpwb_postv_mtn 1.202 0.008 155.756 0.000 1.202 0.821
mpwb_pstv_rltn 0.743 0.010 76.452 0.000 0.743 0.508
mpwb_resilienc 0.970 0.008 114.688 0.000 0.970 0.654
mpwb_self_estm 1.196 0.008 151.048 0.000 1.196 0.800
mpwb_vitality 1.189 0.008 153.797 0.000 1.189 0.762
Variances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.mpwb_competenc 0.951 0.012 80.689 0.000 0.951 0.457
.mpwb_mtnl_stbl 1.023 0.012 84.746 0.000 1.023 0.448
.mpwb_engagemnt 1.194 0.013 90.907 0.000 1.194 0.667
.mpwb_meaning 0.949 0.012 76.064 0.000 0.949 0.428
.mpwb_optimism 1.059 0.013 79.653 0.000 1.059 0.418
.mpwb_postv_mtn 0.698 0.010 67.005 0.000 0.698 0.326
.mpwb_pstv_rltn 1.583 0.017 95.070 0.000 1.583 0.742
.mpwb_resilienc 1.262 0.013 99.437 0.000 1.262 0.573
.mpwb_self_estm 0.804 0.011 71.741 0.000 0.804 0.360
.mpwb_vitality 1.024 0.012 83.341 0.000 1.024 0.420
mpwb 1.000 1.000 1.000
R-Square:
Estimate
mpwb_competenc 0.543
mpwb_mtnl_stbl 0.552
mpwb_engagemnt 0.333
mpwb_meaning 0.572
mpwb_optimism 0.582
mpwb_postv_mtn 0.674
mpwb_pstv_rltn 0.258
mpwb_resilienc 0.427
mpwb_self_estm 0.640
mpwb_vitality 0.580
[1] 0.995
[1] 0.995
r t p
1 0.995 21956.04 <.001
r t p
1 0.995 21874.82 <.001
# Sanity check: How many missing values in global factor scores?
# Expected result is 0 rows: lavPredict() returned a score for every
# respondent (no missing cases on the indicator variables, per above).
df_gmh |>
dplyr::filter(is.na(mpwb_factor_global)) |>
base::nrow()[1] 0
Within Country Factor Scores
# Within-country factor scores: refit the same one-factor MPWB model
# separately inside every country, so each respondent's score is scaled
# relative to their own country's covariance structure rather than the
# pooled global solution.
country_list <- base::split(df_gmh, df_gmh$country)
# Fit the CFA on each country's subset and append that country's
# factor scores as a new column.
country_scores <- lapply(country_list, function(dat) {
  country_fit <- lavaan::cfa(
    'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality',
    data = dat,
    std.lv = TRUE,
    estimator = "MLR",
    sampling.weights = "ps_weight"
  )
  # Single-factor model: take the first (and only) column of scores.
  dat$mpwb_factor_within <- lavaan::lavPredict(country_fit, type = "lv")[, 1]
  dat
})
# Stack the per-country subsets back into one data frame.
df_gmh <- dplyr::bind_rows(country_scores)
# Sanity check
# Per-country summary: group size, distribution of the within-country
# factor scores, and their (weighted and unweighted) correlations with
# the sum/mean composites. The factor scores should track the simple
# composites closely within every country.
df_gmh |>
dplyr::group_by(country) |>
dplyr::group_modify(~ {
tibble::tibble(
n = base::nrow(.x),
mean_factor_within = base::round(base::mean(.x$mpwb_factor_within, na.rm = TRUE), 2),
sd_factor_within = base::round(stats::sd(.x$mpwb_factor_within, na.rm = TRUE), 2),
cor_mpwb_sum = base::round(
stats::cor(.x$mpwb_factor_within, .x$mpwb_sum, use = "complete.obs"),
3
),
cor_mpwb_mean = base::round(
stats::cor(.x$mpwb_factor_within, .x$mpwb_mean, use = "complete.obs"),
3
),
# weighted_corr() is a project-defined helper not shown in this chunk;
# presumably it computes a weighted correlation using ps_weight —
# confirm against its definition earlier in the script.
cor_mpwb_sum_wt = weighted_corr(.x, mpwb_factor_within, mpwb_sum)[[1]],
cor_mpwb_mean_wt = weighted_corr(.x, mpwb_factor_within, mpwb_mean)[[1]]
)
}) |>
dplyr::ungroup() |>
# print_reactable() is a project-defined display helper (not shown here).
print_reactable(sorted_col = "country", width = 500)# Sanity check: How many missing values in factor scores?
# Expected result is 0 rows: every respondent received a within-country score.
df_gmh |>
dplyr::filter(is.na(mpwb_factor_within)) |>
base::nrow()[1] 0
A0.6 Saving data
# Write labels from codebook to df_gmh
# Build a named character vector: names are variable names from the
# codebook, values are their human-readable labels.
codebook_label <- codebook |>
dplyr::select(variable, label) |>
(\(x) { stats::setNames(x$label, x$variable) })()
# Attach each label to the matching column as a `labelled` variable label.
# NOTE(review): assumes every `variable` listed in the codebook exists as
# a column of df_gmh; a name absent from df_gmh would fail here — confirm.
for (v in names(codebook_label)) {
labelled::var_label(df_gmh[[v]]) <- codebook_label[[v]]
}
# Save cleaned data
# NOTE(review): the appendix text refers to "999_clean_data.rds", but the
# file written here is "999_cleaned_data.rds" — confirm the intended
# filename so the "skip preprocessing" instructions stay reproducible.
saveRDS(df_gmh, "999_cleaned_data.rds")
write.csv(df_gmh, "999_cleaned_data.csv", row.names = FALSE)
rm(v, codebook_label)Information About the R Session
─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
setting value
version R version 4.5.2 (2025-10-31)
os macOS Sequoia 15.6
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Lisbon
date 2025-12-26
pandoc 3.6.3 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
quarto 1.4.549 @ /usr/local/bin/quarto
─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
abind 1.4-8 2024-09-12 [1] CRAN (R 4.5.0)
archive 1.1.12 2025-03-20 [1] CRAN (R 4.5.0)
askpass 1.2.1 2024-10-04 [1] CRAN (R 4.5.0)
backports 1.5.0 2024-05-23 [1] CRAN (R 4.5.0)
base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.5.0)
binom * 1.1-1.1 2022-05-02 [1] CRAN (R 4.5.0)
bit 4.6.0 2025-03-06 [1] CRAN (R 4.5.0)
bit64 4.6.0-1 2025-01-16 [1] CRAN (R 4.5.0)
boot 1.3-32 2025-08-29 [1] CRAN (R 4.5.2)
broom 1.0.9 2025-07-28 [1] CRAN (R 4.5.0)
broom.mixed * 0.2.9.6 2024-10-15 [1] CRAN (R 4.5.0)
car * 3.1-3 2024-09-27 [1] CRAN (R 4.5.0)
carData * 3.0-5 2022-01-06 [1] CRAN (R 4.5.0)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.5.0)
checkmate 2.3.2 2024-07-29 [1] CRAN (R 4.5.0)
class 7.3-23 2025-01-01 [1] CRAN (R 4.5.2)
classInt 0.4-11 2025-01-08 [1] CRAN (R 4.5.0)
cli 3.6.5 2025-04-23 [1] CRAN (R 4.5.0)
cluster 2.1.8.1 2025-03-12 [1] CRAN (R 4.5.2)
coda 0.19-4.1 2024-01-31 [1] CRAN (R 4.5.0)
codetools 0.2-20 2024-03-31 [1] CRAN (R 4.5.2)
colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.5.0)
corrplot * 0.95 2024-10-14 [1] CRAN (R 4.5.0)
countrycode * 1.6.1 2025-03-31 [1] CRAN (R 4.5.0)
cowplot * 1.2.0 2025-07-07 [1] CRAN (R 4.5.0)
crayon 1.5.3 2024-06-20 [1] CRAN (R 4.5.0)
crosstalk 1.2.1 2023-11-23 [1] CRAN (R 4.5.0)
curl 7.0.0 2025-08-19 [1] CRAN (R 4.5.0)
data.table 1.17.8 2025-07-10 [1] CRAN (R 4.5.0)
DBI 1.2.3 2024-06-02 [1] CRAN (R 4.5.0)
digest 0.6.37 2024-08-19 [1] CRAN (R 4.5.0)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.5.0)
e1071 1.7-16 2024-09-16 [1] CRAN (R 4.5.0)
emmeans * 1.11.2 2025-07-11 [1] CRAN (R 4.5.0)
estimability 1.5.1 2024-05-12 [1] CRAN (R 4.5.0)
evaluate 1.0.4 2025-06-18 [1] CRAN (R 4.5.0)
farver 2.1.2 2024-05-13 [1] CRAN (R 4.5.0)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.5.0)
flextable * 0.9.10 2025-08-24 [1] CRAN (R 4.5.0)
fontBitstreamVera 0.1.1 2017-02-01 [1] CRAN (R 4.5.0)
fontLiberation 0.1.0 2016-10-15 [1] CRAN (R 4.5.0)
fontquiver 0.2.1 2017-02-01 [1] CRAN (R 4.5.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.5.0)
foreach 1.5.2 2022-02-02 [1] CRAN (R 4.5.0)
foreign 0.8-90 2025-03-31 [1] CRAN (R 4.5.2)
Formula 1.2-5 2023-02-24 [1] CRAN (R 4.5.0)
fs 1.6.6 2025-04-12 [1] CRAN (R 4.5.0)
furrr 0.3.1 2022-08-15 [1] CRAN (R 4.5.0)
future 1.67.0 2025-07-29 [1] CRAN (R 4.5.0)
gdata 3.0.1 2024-10-22 [1] CRAN (R 4.5.0)
gdtools 0.4.4 2025-10-06 [1] CRAN (R 4.5.0)
generics 0.1.4 2025-05-09 [1] CRAN (R 4.5.0)
ggeffects * 2.3.0 2025-06-13 [1] CRAN (R 4.5.0)
ggflags * 0.0.4 2023-10-10 [1] https://jimjam-slam.r-universe.dev (R 4.5.1)
ggfx * 1.0.2 2025-07-24 [1] CRAN (R 4.5.0)
ggh4x * 0.3.1 2025-05-30 [1] CRAN (R 4.5.0)
ggplot2 * 4.0.0 2025-09-11 [1] CRAN (R 4.5.0)
ggplotify * 0.1.2 2023-08-09 [1] CRAN (R 4.5.0)
ggridges * 0.5.7 2025-08-27 [1] CRAN (R 4.5.0)
ggtext * 0.1.2 2022-09-16 [1] CRAN (R 4.5.0)
glmnet 4.1-10 2025-07-17 [1] CRAN (R 4.5.0)
globals 0.18.0 2025-05-08 [1] CRAN (R 4.5.0)
glue 1.8.0 2024-09-30 [1] CRAN (R 4.5.0)
gridExtra * 2.3 2017-09-09 [1] CRAN (R 4.5.0)
gridGraphics 0.5-1 2020-12-13 [1] CRAN (R 4.5.0)
gridtext 0.1.5 2022-09-16 [1] CRAN (R 4.5.0)
gt 1.0.0 2025-04-05 [1] CRAN (R 4.5.0)
gtable * 0.3.6 2024-10-25 [1] CRAN (R 4.5.0)
gtools 3.9.5 2023-11-20 [1] CRAN (R 4.5.0)
haven 2.5.5 2025-05-30 [1] CRAN (R 4.5.0)
Hmisc * 5.2-4 2025-10-05 [1] CRAN (R 4.5.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.5.0)
htmlTable 2.4.3 2024-07-21 [1] CRAN (R 4.5.0)
htmltools * 0.5.8.1 2024-04-04 [1] CRAN (R 4.5.0)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.5.0)
httpuv 1.6.16 2025-04-16 [1] CRAN (R 4.5.0)
insight 1.4.2 2025-09-02 [1] CRAN (R 4.5.0)
interactions * 1.2.0 2024-07-29 [1] CRAN (R 4.5.0)
iterators 1.0.14 2022-02-05 [1] CRAN (R 4.5.0)
janitor * 2.2.1 2024-12-22 [1] CRAN (R 4.5.0)
jomo 2.7-6 2023-04-15 [1] CRAN (R 4.5.0)
jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.5.0)
jtools 2.3.0 2024-08-25 [1] CRAN (R 4.5.0)
kableExtra * 1.4.0 2024-01-24 [1] CRAN (R 4.5.0)
KernSmooth 2.23-26 2025-01-01 [1] CRAN (R 4.5.2)
knitr 1.50 2025-03-16 [1] CRAN (R 4.5.0)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.5.0)
labelled * 2.16.0 2025-10-22 [1] CRAN (R 4.5.0)
later 1.4.2 2025-04-08 [1] CRAN (R 4.5.0)
lattice 0.22-7 2025-04-02 [1] CRAN (R 4.5.2)
lavaan * 0.6-19 2024-09-26 [1] CRAN (R 4.5.0)
leaflet * 2.2.2 2024-03-26 [1] CRAN (R 4.5.0)
leaflet.extras * 2.0.1 2024-08-19 [1] CRAN (R 4.5.0)
leaflet.extras2 * 1.3.2 2025-08-27 [1] CRAN (R 4.5.0)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.5.0)
listenv 0.9.1 2024-01-29 [1] CRAN (R 4.5.0)
lme4 * 1.1-37 2025-03-26 [1] CRAN (R 4.5.0)
lsr * 0.5.2 2021-12-01 [1] CRAN (R 4.5.0)
lubridate * 1.9.4 2024-12-08 [1] CRAN (R 4.5.0)
magick 2.8.7 2025-06-06 [1] CRAN (R 4.5.0)
magrittr 2.0.4 2025-09-12 [1] CRAN (R 4.5.0)
MASS 7.3-65 2025-02-28 [1] CRAN (R 4.5.2)
mathjaxr 2.0-0 2025-12-01 [1] CRAN (R 4.5.2)
Matrix * 1.7-4 2025-08-28 [1] CRAN (R 4.5.2)
metadat * 1.4-0 2025-02-04 [1] CRAN (R 4.5.0)
metafor * 4.8-0 2025-01-28 [1] CRAN (R 4.5.0)
MetBrewer * 0.2.0 2022-03-21 [1] CRAN (R 4.5.0)
mgcv * 1.9-3 2025-04-04 [1] CRAN (R 4.5.2)
mice 3.18.0 2025-05-27 [1] CRAN (R 4.5.0)
mime 0.13 2025-03-17 [1] CRAN (R 4.5.0)
minqa 1.2.8 2024-08-17 [1] CRAN (R 4.5.0)
mitml 0.4-5 2023-03-08 [1] CRAN (R 4.5.0)
mitools 2.4 2019-04-26 [1] CRAN (R 4.5.0)
mnormt 2.1.1 2022-09-26 [1] CRAN (R 4.5.0)
multcomp 1.4-28 2025-01-29 [1] CRAN (R 4.5.0)
mvtnorm 1.3-3 2025-01-10 [1] CRAN (R 4.5.0)
nlme * 3.1-168 2025-03-31 [1] CRAN (R 4.5.2)
nloptr 2.2.1 2025-03-17 [1] CRAN (R 4.5.0)
nnet 7.3-20 2025-01-01 [1] CRAN (R 4.5.2)
numDeriv * 2016.8-1.1 2019-06-06 [1] CRAN (R 4.5.0)
officer * 0.7.0 2025-09-03 [1] CRAN (R 4.5.0)
openssl 2.3.3 2025-05-26 [1] CRAN (R 4.5.0)
pacman * 0.5.1 2019-03-11 [1] CRAN (R 4.5.0)
pagedown 0.23 2025-08-20 [1] CRAN (R 4.5.0)
pan 1.9 2023-12-07 [1] CRAN (R 4.5.0)
pander 0.6.6 2025-03-01 [1] CRAN (R 4.5.0)
parallelly 1.45.1 2025-07-24 [1] CRAN (R 4.5.0)
pbivnorm 0.6.0 2015-01-23 [1] CRAN (R 4.5.0)
performance * 0.15.2 2025-10-06 [1] CRAN (R 4.5.0)
pillar 1.11.0 2025-07-04 [1] CRAN (R 4.5.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.5.0)
processx 3.8.6 2025-02-21 [1] CRAN (R 4.5.0)
promises 1.3.3 2025-05-29 [1] CRAN (R 4.5.0)
proxy 0.4-27 2022-06-09 [1] CRAN (R 4.5.0)
ps 1.9.1 2025-04-12 [1] CRAN (R 4.5.0)
psych * 2.5.6 2025-06-23 [1] CRAN (R 4.5.0)
purrr * 1.1.0 2025-07-10 [1] CRAN (R 4.5.0)
quadprog 1.5-8 2019-11-20 [1] CRAN (R 4.5.0)
qualtRics * 3.2.1 2024-08-16 [1] CRAN (R 4.5.0)
R6 2.6.1 2025-02-15 [1] CRAN (R 4.5.0)
ragg 1.4.0 2025-04-10 [1] CRAN (R 4.5.0)
rappdirs 0.3.3 2021-01-31 [1] CRAN (R 4.5.0)
rbibutils 2.3 2024-10-04 [1] CRAN (R 4.5.0)
RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.5.0)
Rcpp 1.1.0 2025-07-02 [1] CRAN (R 4.5.0)
Rdpack 2.6.4 2025-04-09 [1] CRAN (R 4.5.0)
reactable * 0.4.4 2023-03-12 [1] CRAN (R 4.5.0)
reactR 0.6.1 2024-09-14 [1] CRAN (R 4.5.0)
readr * 2.1.5 2024-01-10 [1] CRAN (R 4.5.0)
readxl * 1.4.5 2025-03-07 [1] CRAN (R 4.5.0)
reformulas 0.4.1 2025-04-30 [1] CRAN (R 4.5.0)
report * 0.6.1 2025-02-07 [1] CRAN (R 4.5.0)
rlang * 1.1.6 2025-04-11 [1] CRAN (R 4.5.0)
rmarkdown 2.29 2024-11-04 [1] CRAN (R 4.5.0)
rmcorr * 0.7.0 2024-07-26 [1] CRAN (R 4.5.0)
rnaturalearth * 1.1.0 2025-07-28 [1] CRAN (R 4.5.0)
rnaturalearthdata * 1.0.0 2024-02-09 [1] CRAN (R 4.5.0)
rpart 4.1.24 2025-01-07 [1] CRAN (R 4.5.2)
rstudioapi 0.17.1 2024-10-22 [1] CRAN (R 4.5.0)
S7 0.2.0 2024-11-07 [1] CRAN (R 4.5.0)
sandwich 3.1-1 2024-09-15 [1] CRAN (R 4.5.0)
sass 0.4.10 2025-04-11 [1] CRAN (R 4.5.0)
scales * 1.4.0 2025-04-24 [1] CRAN (R 4.5.0)
see * 0.11.0 2025-03-11 [1] CRAN (R 4.5.0)
semTools * 0.5-7 2025-03-13 [1] CRAN (R 4.5.0)
servr 0.32 2024-10-04 [1] CRAN (R 4.5.0)
sessioninfo * 1.2.3 2025-02-05 [1] CRAN (R 4.5.0)
sf * 1.0-21 2025-05-15 [1] CRAN (R 4.5.0)
shape 1.4.6.1 2024-02-23 [1] CRAN (R 4.5.0)
showtext * 0.9-7 2024-03-02 [1] CRAN (R 4.5.0)
showtextdb * 3.0 2020-06-04 [1] CRAN (R 4.5.0)
sjlabelled 1.2.0 2022-04-10 [1] CRAN (R 4.5.0)
sjPlot * 2.9.0 2025-07-10 [1] CRAN (R 4.5.0)
snakecase 0.11.1 2023-08-27 [1] CRAN (R 4.5.0)
stringi 1.8.7 2025-03-27 [1] CRAN (R 4.5.0)
stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.5.0)
survey * 4.4-8 2025-08-28 [1] CRAN (R 4.5.0)
survival * 3.8-3 2024-12-17 [1] CRAN (R 4.5.2)
svglite 2.2.1 2025-05-12 [1] CRAN (R 4.5.0)
sysfonts * 0.8.9 2024-03-02 [1] CRAN (R 4.5.0)
systemfonts 1.3.1 2025-10-01 [1] CRAN (R 4.5.0)
textshaping 1.0.1 2025-05-01 [1] CRAN (R 4.5.0)
TH.data 1.1-3 2025-01-17 [1] CRAN (R 4.5.0)
tibble * 3.3.0 2025-06-08 [1] CRAN (R 4.5.0)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.5.0)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.5.0)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.5.0)
tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.5.0)
units 0.8-7 2025-03-11 [1] CRAN (R 4.5.0)
utf8 1.2.6 2025-06-08 [1] CRAN (R 4.5.0)
uuid 1.2-1 2024-07-29 [1] CRAN (R 4.5.0)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.5.0)
viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.5.0)
visdat * 0.6.0 2023-02-02 [1] CRAN (R 4.5.0)
vroom 1.6.5 2023-12-05 [1] CRAN (R 4.5.0)
websocket 1.4.4 2025-04-10 [1] CRAN (R 4.5.0)
weights * 1.1.2 2025-06-18 [1] CRAN (R 4.5.0)
withr 3.0.2 2024-10-28 [1] CRAN (R 4.5.0)
xfun 0.52 2025-04-02 [1] CRAN (R 4.5.0)
xml2 1.3.8 2025-03-14 [1] CRAN (R 4.5.0)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.5.0)
yaml 2.3.10 2024-07-26 [1] CRAN (R 4.5.0)
yulab.utils 0.2.1 2025-08-19 [1] CRAN (R 4.5.0)
zip 2.3.3 2025-05-13 [1] CRAN (R 4.5.0)
zoo 1.8-14 2025-04-10 [1] CRAN (R 4.5.0)
[1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
* ── Packages attached to the search path.
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────